In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
import os
import re
import urllib2

import numpy as np
import random
import time

import json

import logging
from logging import info

from multiprocessing import Pool as ThreadPool
import itertools

import xml.etree.ElementTree as ET

import nltk

from thesis.utils.text import get_sentences, sentence_wordtokenizer

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
# root_location = "/mnt/data2/shalaby/"
root_location = "/home/local/shalaby/"
exports_location = root_location + "exported_data/"

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

In [4]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 29.3 s, sys: 1.54 s, total: 30.8 s
Wall time: 30.8 s


In [5]:
len(training_docs_list)

1286325

#### Extraction Utils 

In [6]:
#ES_URL = 'http://localhost:9200/patents/patent/{}'
ES_URL = 'http://yell.dbs.ifi.lmu.de:9200/patents/patent/{}'
HEADING_TAG = 'heading'
PARAGRAPH_TAG = 'p'
UL_TAG = 'ul'
LI_TAG = 'li'
OL_TAG = 'ol'
DESC_OF_DRAWINGS_TAG = 'description-of-drawings'
MIN_PARAGRAPH_LENGTH = 50

In [7]:
def merge_with_previous(curr_node_tag, previous_node_tag, previous_node_text):
    if curr_node_tag == PARAGRAPH_TAG and previous_node_tag == HEADING_TAG:
        return True
    if previous_node_text and len(previous_node_text) < MIN_PARAGRAPH_LENGTH:
        return True
    return False
    
def get_paragraphs(root):
    paragraphs = []
    previous_node_text = None
    previous_tag = None
    for child in root:
        node_text = None
        if child.tag != DESC_OF_DRAWINGS_TAG:
            node_text = get_node_text(child)
            if node_text.strip():
                if merge_with_previous(child.tag, previous_tag, previous_node_text) and len(paragraphs) > 0:
                    paragraphs[-1] += ' ' + node_text
                else:
                    paragraphs.append(node_text)
        else:
            node_text = extract_desc_of_drawings_paragraph(child)
            paragraphs.append(node_text)
            
        previous_tag = child.tag
        previous_node_text = node_text
    return paragraphs
    
def extract_desc_of_drawings_paragraph(node):
    previous_tag = None
    sentences = []
    for child in node:
        node_text = get_node_text(child)
        if child.tag == PARAGRAPH_TAG and previous_tag == HEADING_TAG:
            sentences[-1] += ' ' + node_text
        else:
            # a paragraph in drawings descriptions is treated as a sentence
            if child.tag == PARAGRAPH_TAG:
                node_text = apply_sentence_end(node_text)
            sentences.append(node_text)
        previous_tag = child.tag
    
    return ' '.join(sentences)

def apply_sentence_end(text):
    if text and text.strip():
        text = text.strip().strip(';.')
        text += '. '
    return text

def itertext_custom(self):
    tag = self.tag
    if not isinstance(tag, basestring) and tag is not None:
        return
    if self.text:
        if tag == LI_TAG:
            yield apply_sentence_end(self.text)
        else:
            yield self.text.replace('\n',' ')
    for e in self:
        for s in e.itertext_custom():
            yield s
        if e.tail:
            yield e.tail

ET.Element.itertext_custom = itertext_custom
# def get_node_text(node):
#     node_text = ''
#     for child in node:
#         # for ul tags, get li tags as sentences
#         if child.tag == UL_TAG:
#             li_sentences = [apply_sentence_end(get_node_text_iterative(c)) for c in child]
#             child_text = ' '.join(li_sentences)
#         else:
#             child_text = get_node_text_iterative(child)
#         node_text += child_text
#     return node_text
        
get_node_text = lambda node: ''.join(node.itertext_custom()).strip()

In [8]:
def conc_paragraphs(parag1, parag2):
    return parag1.strip('.') + '.' + ' ' + parag2

def concatenate_sentences_to_paragraphs(paragraphs):
    """
    for 1 sentence paragraphs, concatenate them to the next or previous paragraph depending on context
    """
    for i in range(len(paragraphs)):
        if i >= len((paragraphs)): break
        parag = paragraphs[i]
        sentences = get_sentences(parag)
        
        if len(sentences) == 1:
            prev_paragraph = paragraphs[i-1] if i-1 >= 0 else None
            next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None

            if (next_paragraph and len(get_sentences(next_paragraph)) == 1):
                # If a series of 1 sentence length paragraphs exist, conc all of them in one paragraph
                while True:
                    if next_paragraph and len(get_sentences(next_paragraph)) == 1:
                        parag = conc_paragraphs(parag, next_paragraph)
                        paragraphs[i] = parag
                        del paragraphs[i+1]

                        # reinitialize for loop
                        next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None
                    else:
                        break

            # otherwise, just concatenate the 1 sentence paragraph to the previous paragraph
            elif prev_paragraph:
#                 print '============== Found prev eligible paragraph'
                prev_paragraph = conc_paragraphs(prev_paragraph, parag)
                paragraphs[i-1] = prev_paragraph
                del paragraphs[i]

            # if this is the first paragraph, then just concatenate it with the next one
            elif next_paragraph:
                parag = conc_paragraphs(parag, next_paragraph)
                paragraphs[i] = parag
                del paragraphs[i+1]

def get_adjusted_paragraphs(root, conc_sentences=True):
    paragraphs = get_paragraphs(root)
    if conc_sentences:
        concatenate_sentences_to_paragraphs(paragraphs)
    return paragraphs

In [9]:
def get_patent(doc_id):
    url_to_fetch = ES_URL.format(doc_id)

    response = urllib2.urlopen(url_to_fetch, timeout=60)
    patent_content = response.read()

    patent_object = json.loads(patent_content)['_source']
    return patent_object

# Actual Extraction

In [10]:
ABSTRACT_ID = "{}_abstract"
DESC_ID = "{}_description"
CLAIMS_ID = "{}_claims"

ABSTRACT_PART_ID = "{}_abstract_part-{}"
DESC_PART_ID = "{}_description_part-{}"
CLAIMS_PART_ID = "{}_claims_part-{}"

In [11]:
BATCH_SIZE = 10000
REPORT_EVERY = 1000

# preprocessed_location = "/mnt/data/shalaby/" + "preprocessed_data/extended_pv_abs_desc_claims_full_chunks/"
preprocessed_location = root_location + "preprocessed_data/extended_pv_abs_desc_claims_full_chunks/"
TRAINING_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_training_docs_data_preprocessed-"
VALIDATION_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_validation_docs_data_preprocessed-"
TEST_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_test_docs_data_preprocessed-"

if not os.path.exists(preprocessed_location):
    os.makedirs(preprocessed_location)

In [12]:
NUM_ABSTRACT_PARTS = 3
NUM_DESC_PARTS = 23
NUM_CLAIMS_PARTS = 4

In [13]:
def multithreaded_extended_batch_creation(start_index):

    if os.path.exists(FILE_PREFIX + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()
    
    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        if doc_index > 9900:
            print doc_index, doc_id
            
        patent_doc = get_patent(doc_id)
        
        # Abstract
        abstract = patent_doc['abstract'][0]
        root = ET.fromstring(abstract.encode('utf-8'))
        abs_paragraphs = get_adjusted_paragraphs(root)
        
        # Description
        desc = patent_doc['description'][0]
        root = ET.fromstring(desc.encode('utf-8'))
        desc_paragraphs = get_adjusted_paragraphs(root)
        
        # Claims
        claims = patent_doc['claims'][0]
        root = ET.fromstring(claims.encode('utf-8'))
        claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
#         claims_paragraphs = []
#         for claim in patent_doc['claims']:
#             claims_paragraphs.append(claim.strip())

#         abstract_sentences = sum([get_sentences(abs_parag) for abs_parag in abs_paragraphs], [])
#         desc_sentences = sum([get_sentences(desc_parag) for desc_parag in desc_paragraphs], [])
#         claims_sentences = sum([get_sentences(claim_parag) for claim_parag in claims_paragraphs], [])
        

        abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
        desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
        claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
        
        
        
        # lists of list of tokens
        doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
        abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
        description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
        claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens
        
        # now add the tokens lists that will be written to the file
        token_lines.append(doc_tokens_list)
        token_lines.append(abstract_tokens_list)
        token_lines.append(description_tokens_list)
        token_lines.append(claims_tokens_list)
        
        for i in range(NUM_ABSTRACT_PARTS):
            start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
            token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])
        
        for i in range(NUM_DESC_PARTS):
            start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
            token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    
        
        for i in range(NUM_CLAIMS_PARTS):
            start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
            token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])
            
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
        if doc_index >=9951:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines
    
def get_doc_range(i, number_of_tokens, number_of_parts):
    start, end = 0,0
    if number_of_tokens < number_of_parts:
        if i==0:
            return 0, None
        else:
            return number_of_tokens,None
    if i == 0:
        start = 0
    else:
        start = (number_of_tokens / number_of_parts) * i
    if i+1 == number_of_parts:
        end = None
    else:
        end = (number_of_tokens / number_of_parts) * (i+1)
    return start, end

In [14]:
def write_batch(file_prefix, batch_lines, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        with open(file_prefix + str(batch_start), 'w') as batch_file:
            for line in batch_lines:
                batch_file.write((u" ".join(line) + "\n").encode('utf-8'))
        print "finished writing batch %d" % batch_start

## Training

In [15]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [21]:
batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE)

In [22]:
batches

[0,
 10000,
 20000,
 30000,
 40000,
 50000,
 60000,
 70000,
 80000,
 90000,
 100000,
 110000,
 120000,
 130000,
 140000,
 150000,
 160000,
 170000,
 180000,
 190000,
 200000,
 210000,
 220000,
 230000,
 240000,
 250000]

In [23]:
try:
    pool = ThreadPool(9)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-22 00:57:06,749 : INFO : Batch creation working on 20000

2017-03-22 00:57:06,752 : INFO : Batch creation working on 40000

2017-03-22 00:57:06,750 : INFO : Batch creation working on 30000

2017-03-22 00:57:06,753 : INFO : Batch creation working on 50000

2017-03-22 00:57:06,749 : INFO : Batch creation working on 10000

2017-03-22 00:57:06,754 : INFO : Batch creation working on 80000

2017-03-22 00:57:06,753 : INFO : Batch creation working on 60000

2017-03-22 00:57:06,749 : INFO : Batch creation working on 0

2017-03-22 00:57:06,754 : INFO : Batch creation working on 70000

2017-03-22 00:57:13,067 : INFO : Doc:      0 -> Total Lines to write:       34
2017-03-22 00:57:13,087 : INFO : Doc:  30000 -> Total Lines to write:       34
2017-03-22 00:57:13,101 : INFO : Doc:  40000 -> Total Lines to write:       34
2017-03-22 00:57:13,114 : INFO : Doc:  60000 -> Total Lines to write:       34
2017-03-22 00:57:13,217 : INFO : Doc:  10000 -> Total Lines to write:       34
2017-03-22 00:5

writing batch 0


2017-03-22 01:34:39,276 : INFO : Doc:  69000 -> Total Lines to write:   306034
2017-03-22 01:34:55,892 : INFO : Doc:  89000 -> Total Lines to write:   306034
2017-03-22 01:35:05,018 : INFO : Batch creation working on 90000

2017-03-22 01:35:05,238 : INFO : Doc:  90000 -> Total Lines to write:       34
2017-03-22 01:36:18,382 : INFO : Doc:  79000 -> Total Lines to write:   306034
2017-03-22 01:36:58,889 : INFO : Finished batch 50000 of size 10000 in 39m 52s
2017-03-22 01:36:58,892 : INFO : For index 50000, the actual number of lines written is: 340000


writing batch 50000


2017-03-22 01:37:23,118 : INFO : Finished batch 30000 of size 10000 in 40m 16s
2017-03-22 01:37:23,121 : INFO : For index 30000, the actual number of lines written is: 340000


writing batch 30000


2017-03-22 01:37:25,757 : INFO : Finished batch 20000 of size 10000 in 40m 19s
2017-03-22 01:37:25,760 : INFO : For index 20000, the actual number of lines written is: 340000


writing batch 20000


2017-03-22 01:37:31,328 : INFO : Batch creation working on 100000

2017-03-22 01:37:31,795 : INFO : Doc: 100000 -> Total Lines to write:       34
2017-03-22 01:37:32,520 : INFO : Finished batch 40000 of size 10000 in 40m 26s
2017-03-22 01:37:32,539 : INFO : For index 40000, the actual number of lines written is: 340000


writing batch 40000


2017-03-22 01:38:07,037 : INFO : Finished batch 10000 of size 10000 in 41m 0s
2017-03-22 01:38:07,040 : INFO : For index 10000, the actual number of lines written is: 340000


writing batch 10000


2017-03-22 01:38:20,528 : INFO : Batch creation working on 110000

2017-03-22 01:38:20,528 : INFO : Batch creation working on 120000

2017-03-22 01:38:20,770 : INFO : Doc: 110000 -> Total Lines to write:       34
2017-03-22 01:38:20,772 : INFO : Doc: 120000 -> Total Lines to write:       34
2017-03-22 01:38:20,531 : INFO : Batch creation working on 130000

2017-03-22 01:38:21,470 : INFO : Doc: 130000 -> Total Lines to write:       34
2017-03-22 01:38:40,224 : INFO : Finished batch 80000 of size 10000 in 41m 33s
2017-03-22 01:38:40,227 : INFO : For index 80000, the actual number of lines written is: 340000


writing batch 80000


2017-03-22 01:38:59,486 : INFO : Finished batch 60000 of size 10000 in 41m 53s
2017-03-22 01:38:59,488 : INFO : For index 60000, the actual number of lines written is: 340000


writing batch 60000


2017-03-22 01:39:00,053 : INFO : Doc:  91000 -> Total Lines to write:    34034
2017-03-22 01:40:22,803 : INFO : Finished batch 70000 of size 10000 in 43m 16s
2017-03-22 01:40:22,849 : INFO : For index 70000, the actual number of lines written is: 340000


writing batch 70000


2017-03-22 01:40:23,286 : INFO : Batch creation working on 140000

2017-03-22 01:40:23,588 : INFO : Doc: 140000 -> Total Lines to write:       34
2017-03-22 01:40:46,576 : INFO : Batch creation working on 150000

2017-03-22 01:40:46,713 : INFO : Doc: 150000 -> Total Lines to write:       34
2017-03-22 01:40:46,576 : INFO : Batch creation working on 160000

2017-03-22 01:40:47,052 : INFO : Doc: 160000 -> Total Lines to write:       34
2017-03-22 01:40:58,678 : INFO : Batch creation working on 170000

2017-03-22 01:40:59,009 : INFO : Doc: 170000 -> Total Lines to write:       34
2017-03-22 01:41:24,933 : INFO : Doc: 101000 -> Total Lines to write:    34034
2017-03-22 01:41:52,876 : INFO : Doc: 131000 -> Total Lines to write:    34034
2017-03-22 01:41:55,147 : INFO : Doc: 121000 -> Total Lines to write:    34034
2017-03-22 01:42:10,684 : INFO : Doc: 111000 -> Total Lines to write:    34034
2017-03-22 01:42:50,911 : INFO : Doc:  92000 -> Total Lines to write:    68034
2017-03-22 01:44:31,6

writing batch 90000


2017-03-22 02:16:12,538 : INFO : Batch creation working on 180000

2017-03-22 02:16:13,186 : INFO : Doc: 180000 -> Total Lines to write:       34
2017-03-22 02:16:51,911 : INFO : Doc: 119000 -> Total Lines to write:   306034
2017-03-22 02:17:17,579 : INFO : Doc: 109000 -> Total Lines to write:   306034
2017-03-22 02:17:21,439 : INFO : Doc: 159000 -> Total Lines to write:   306034
2017-03-22 02:17:48,873 : INFO : Doc: 169000 -> Total Lines to write:   306034
2017-03-22 02:19:21,192 : INFO : Finished batch 130000 of size 10000 in 41m 0s
2017-03-22 02:19:21,196 : INFO : For index 130000, the actual number of lines written is: 340000


writing batch 130000


2017-03-22 02:19:29,659 : INFO : Doc: 179000 -> Total Lines to write:   306034
2017-03-22 02:19:42,564 : INFO : Finished batch 120000 of size 10000 in 41m 22s
2017-03-22 02:19:42,566 : INFO : For index 120000, the actual number of lines written is: 340000


writing batch 120000


2017-03-22 02:20:00,967 : INFO : Batch creation working on 190000

2017-03-22 02:20:01,191 : INFO : Doc: 190000 -> Total Lines to write:       34
2017-03-22 02:20:21,276 : INFO : Batch creation working on 200000

2017-03-22 02:20:21,621 : INFO : Doc: 200000 -> Total Lines to write:       34
2017-03-22 02:20:38,956 : INFO : Doc: 181000 -> Total Lines to write:    34034
2017-03-22 02:21:19,550 : INFO : Finished batch 110000 of size 10000 in 42m 59s
2017-03-22 02:21:19,566 : INFO : For index 110000, the actual number of lines written is: 340000


writing batch 110000


2017-03-22 02:21:31,650 : INFO : Finished batch 150000 of size 10000 in 40m 45s
2017-03-22 02:21:31,653 : INFO : For index 150000, the actual number of lines written is: 340000


writing batch 150000


2017-03-22 02:21:35,781 : INFO : Finished batch 100000 of size 10000 in 44m 4s
2017-03-22 02:21:35,783 : INFO : For index 100000, the actual number of lines written is: 340000


writing batch 100000


2017-03-22 02:22:03,009 : INFO : Batch creation working on 210000

2017-03-22 02:22:03,406 : INFO : Doc: 210000 -> Total Lines to write:       34
2017-03-22 02:22:12,410 : INFO : Batch creation working on 220000

2017-03-22 02:22:12,784 : INFO : Doc: 220000 -> Total Lines to write:       34
2017-03-22 02:22:18,052 : INFO : Batch creation working on 230000

2017-03-22 02:22:18,697 : INFO : Doc: 230000 -> Total Lines to write:       34
2017-03-22 02:22:34,255 : INFO : Finished batch 160000 of size 10000 in 41m 48s
2017-03-22 02:22:34,263 : INFO : For index 160000, the actual number of lines written is: 340000


writing batch 160000


2017-03-22 02:23:10,387 : INFO : Batch creation working on 240000

2017-03-22 02:23:10,620 : INFO : Doc: 240000 -> Total Lines to write:       34
2017-03-22 02:23:41,342 : INFO : Finished batch 170000 of size 10000 in 42m 43s
2017-03-22 02:23:41,344 : INFO : For index 170000, the actual number of lines written is: 340000


writing batch 170000


2017-03-22 02:24:13,706 : INFO : Doc: 191000 -> Total Lines to write:    34034
2017-03-22 02:24:17,772 : INFO : Batch creation working on 250000

2017-03-22 02:24:17,901 : INFO : Doc: 250000 -> Total Lines to write:       34
2017-03-22 02:24:49,179 : INFO : Doc: 182000 -> Total Lines to write:    68034
2017-03-22 02:25:01,448 : INFO : Doc: 201000 -> Total Lines to write:    34034
2017-03-22 02:26:18,664 : INFO : Doc: 231000 -> Total Lines to write:    34034
2017-03-22 02:26:18,683 : INFO : Doc: 211000 -> Total Lines to write:    34034
2017-03-22 02:26:24,282 : INFO : Doc: 221000 -> Total Lines to write:    34034
2017-03-22 02:27:22,697 : INFO : Doc: 241000 -> Total Lines to write:    34034
2017-03-22 02:28:07,126 : INFO : Doc: 251000 -> Total Lines to write:    34034
2017-03-22 02:28:52,947 : INFO : Doc: 192000 -> Total Lines to write:    68034
2017-03-22 02:29:12,838 : INFO : Doc: 202000 -> Total Lines to write:    68034
2017-03-22 02:29:19,431 : INFO : Doc: 183000 -> Total Lines to w

writing batch 250000


2017-03-22 02:41:35,353 : INFO : Doc: 205000 -> Total Lines to write:   170034
2017-03-22 02:41:41,281 : INFO : Doc: 195000 -> Total Lines to write:   170034
2017-03-22 02:41:49,932 : INFO : Doc: 186000 -> Total Lines to write:   204034
2017-03-22 02:42:42,483 : INFO : Doc: 215000 -> Total Lines to write:   170034
2017-03-22 02:42:44,377 : INFO : Doc: 145000 -> Total Lines to write:   170034
2017-03-22 02:42:51,146 : INFO : Doc: 225000 -> Total Lines to write:   170034
2017-03-22 02:42:57,285 : INFO : Doc: 235000 -> Total Lines to write:   170034
2017-03-22 02:43:56,025 : INFO : Doc: 245000 -> Total Lines to write:   170034
2017-03-22 02:45:18,821 : INFO : Doc: 196000 -> Total Lines to write:   204034
2017-03-22 02:45:21,321 : INFO : Doc: 206000 -> Total Lines to write:   204034
2017-03-22 02:45:24,851 : INFO : Doc: 187000 -> Total Lines to write:   238034
2017-03-22 02:46:18,287 : INFO : Doc: 216000 -> Total Lines to write:   204034
2017-03-22 02:46:32,974 : INFO : Doc: 226000 -> Tota

writing batch 180000


2017-03-22 02:57:48,805 : INFO : Doc: 229000 -> Total Lines to write:   306034
2017-03-22 02:58:06,970 : INFO : Doc: 219000 -> Total Lines to write:   306034
2017-03-22 02:58:14,372 : INFO : Doc: 149000 -> Total Lines to write:   306034
2017-03-22 02:58:41,933 : INFO : Doc: 239000 -> Total Lines to write:   306034
2017-03-22 03:00:35,314 : INFO : Doc: 249000 -> Total Lines to write:   306034
2017-03-22 03:00:40,809 : INFO : Finished batch 190000 of size 10000 in 40m 40s
2017-03-22 03:00:40,811 : INFO : For index 190000, the actual number of lines written is: 340000


writing batch 190000


2017-03-22 03:01:04,847 : INFO : Finished batch 200000 of size 10000 in 40m 43s
2017-03-22 03:01:04,850 : INFO : For index 200000, the actual number of lines written is: 340000


writing batch 200000


2017-03-22 03:01:45,252 : INFO : Finished batch 210000 of size 10000 in 39m 42s
2017-03-22 03:01:45,255 : INFO : For index 210000, the actual number of lines written is: 340000


writing batch 210000


2017-03-22 03:01:59,204 : INFO : Finished batch 140000 of size 10000 in 81m 36s
2017-03-22 03:01:59,207 : INFO : For index 140000, the actual number of lines written is: 340000


writing batch 140000


2017-03-22 03:02:17,960 : INFO : Finished batch 220000 of size 10000 in 40m 6s
2017-03-22 03:02:17,964 : INFO : For index 220000, the actual number of lines written is: 340000


writing batch 220000


2017-03-22 03:03:03,237 : INFO : Finished batch 230000 of size 10000 in 40m 45s
2017-03-22 03:03:03,240 : INFO : For index 230000, the actual number of lines written is: 340000


writing batch 230000


2017-03-22 03:04:34,720 : INFO : Finished batch 240000 of size 10000 in 41m 24s
2017-03-22 03:04:34,723 : INFO : For index 240000, the actual number of lines written is: 340000


writing batch 240000


In [25]:
multithreaded_extended_batch_creation(0)

2017-03-16 10:44:02,083 : INFO : Batch creation working on 0

2017-03-16 10:44:02,175 : INFO : Doc:      0 -> Total Lines to write:       78
2017-03-16 10:44:18,155 : INFO : Finished batch 0 of size 100 in 0m 16s
2017-03-16 10:44:18,160 : INFO : For index 0, the actual number of lines written is: 7800


writing batch 0


## Validation

In [15]:
DOCS_LIST = validation_docs_list
FILE_PREFIX = VALIDATION_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(validation_docs_list)

In [None]:
try:
    pool = ThreadPool(6)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-04-09 01:56:59,587 : INFO : Batch creation working on 60000

2017-04-09 01:56:59,588 : INFO : Batch creation working on 70000

2017-04-09 01:56:59,745 : INFO : Doc:  70000 -> Total Lines to write:       34
2017-04-09 01:56:59,761 : INFO : Doc:  60000 -> Total Lines to write:       34
2017-04-09 01:58:45,475 : INFO : Doc:  61000 -> Total Lines to write:    34034
2017-04-09 01:58:51,049 : INFO : Doc:  71000 -> Total Lines to write:    34034
2017-04-09 02:00:32,805 : INFO : Doc:  62000 -> Total Lines to write:    68034
2017-04-09 02:00:52,080 : INFO : Doc:  72000 -> Total Lines to write:    68034
2017-04-09 02:02:37,266 : INFO : Doc:  63000 -> Total Lines to write:   102034
2017-04-09 02:03:01,649 : INFO : Doc:  73000 -> Total Lines to write:   102034
2017-04-09 02:04:49,226 : INFO : Doc:  64000 -> Total Lines to write:   136034
2017-04-09 02:05:01,760 : INFO : Doc:  74000 -> Total Lines to write:   136034
2017-04-09 02:06:59,091 : INFO : Doc:  65000 -> Total Lines to write:   170034

writing batch 70000


Process PoolWorker-3:
Process PoolWorker-4:
Process PoolWorker-5:
Process PoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self

In [20]:
%%time
ff = get_patent(validation_docs_list[69952])

CPU times: user 1.03 s, sys: 524 ms, total: 1.55 s
Wall time: 12.8 s


In [22]:
patent_doc = ff
doc_id = validation_docs_list[69952]

In [17]:
validation_docs_list[69952]

u'08207316'

In [39]:
validation_docs_list[69953]

u'08002075'

In [62]:
token_lines = []

In [63]:
doc_id = validation_docs_list[69951]
patent_doc = get_patent(doc_id)
print 'Downloaded Document'

# Abstract
abstract = patent_doc['abstract'][0]
root = ET.fromstring(abstract.encode('utf-8'))
abs_paragraphs = get_adjusted_paragraphs(root)
info('Finished abstract')

# Description
desc = patent_doc['description'][0]
root = ET.fromstring(desc.encode('utf-8'))
desc_paragraphs = get_adjusted_paragraphs(root)
info('Finished description')

# Claims
claims = patent_doc['claims'][0]
root = ET.fromstring(claims.encode('utf-8'))
claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
info('Finished claims')


abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
info('Finished sentences')



# lists of list of tokens
doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens

# now add the tokens lists that will be written to the file
token_lines.append(doc_tokens_list)
token_lines.append(abstract_tokens_list)
token_lines.append(description_tokens_list)
token_lines.append(claims_tokens_list)

for i in range(NUM_ABSTRACT_PARTS):
    start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
    token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])

for i in range(NUM_DESC_PARTS):
    start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
    token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    

for i in range(NUM_CLAIMS_PARTS):
    start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
    token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])


2017-04-10 00:59:34,495 : INFO : Finished abstract
2017-04-10 00:59:34,571 : INFO : Finished description
2017-04-10 00:59:34,577 : INFO : Finished claims


Downloaded Document


2017-04-10 00:59:34,802 : INFO : Finished sentences


In [64]:
doc_id = validation_docs_list[69952]
patent_doc = get_patent(doc_id)
print 'Downloaded Document'

# Abstract
abstract = patent_doc['abstract'][0]
root = ET.fromstring(abstract.encode('utf-8'))
abs_paragraphs = get_adjusted_paragraphs(root)
info('Finished abstract')

# Description
desc = patent_doc['description'][0]
root = ET.fromstring(desc.encode('utf-8'))
desc_paragraphs = get_adjusted_paragraphs(root)
info('Finished description')

# Claims
claims = patent_doc['claims'][0]
root = ET.fromstring(claims.encode('utf-8'))
claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
info('Finished claims')


abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
# desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
info('Finished sentences')



# # lists of list of tokens
# doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
# abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
# description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
# claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens

# # now add the tokens lists that will be written to the file
# token_lines.append(doc_tokens_list)
# token_lines.append(abstract_tokens_list)
# token_lines.append(description_tokens_list)
# token_lines.append(claims_tokens_list)

# for i in range(NUM_ABSTRACT_PARTS):
#     start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
#     token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])

# for i in range(NUM_DESC_PARTS):
#     start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
#     token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    

# for i in range(NUM_CLAIMS_PARTS):
#     start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
#     token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])


2017-04-10 00:59:56,967 : INFO : Finished abstract


Downloaded Document


2017-04-10 01:00:43,768 : INFO : Finished description
2017-04-10 01:00:43,770 : INFO : Finished claims
2017-04-10 01:00:43,775 : INFO : Finished sentences


In [65]:
desc_tokens = pickle.load(open(root_location + 'desc_tokens.pkl', 'r'))

In [66]:


# lists of list of tokens
doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens

# now add the tokens lists that will be written to the file
token_lines.append(doc_tokens_list)
token_lines.append(abstract_tokens_list)
token_lines.append(description_tokens_list)
token_lines.append(claims_tokens_list)

for i in range(NUM_ABSTRACT_PARTS):
    start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
    token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])

for i in range(NUM_DESC_PARTS):
    start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
    token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    

for i in range(NUM_CLAIMS_PARTS):
    start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
    token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])


In [67]:
len(token_lines)

68

In [58]:
len(claims_tokens)

863

In [28]:
pickle.dump(desc_tokens, open(root_location + 'desc_tokens.pkl', 'w'))

In [None]:


# lists of list of tokens
doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens

# now add the tokens lists that will be written to the file
token_lines.append(doc_tokens_list)
token_lines.append(abstract_tokens_list)
token_lines.append(description_tokens_list)
token_lines.append(claims_tokens_list)

for i in range(NUM_ABSTRACT_PARTS):
    start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
    token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])

for i in range(NUM_DESC_PARTS):
    start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
    token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    

for i in range(NUM_CLAIMS_PARTS):
    start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
    token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])


In [35]:
len(token_lines)

1598

In [37]:
token_lines[0][:10]

[u'08002075',
 u'a',
 u'steering',
 u'device',
 u'for',
 u'a',
 u'superposition',
 u'steering',
 u'system',
 u'is']

In [20]:
multithreaded_extended_batch_creation(60000)

2017-04-09 15:46:24,554 : INFO : Batch creation working on 60000

2017-04-09 15:46:24,781 : INFO : Doc:  60000 -> Total Lines to write:       34
2017-04-09 15:50:50,363 : INFO : Doc:  61000 -> Total Lines to write:    34034
2017-04-09 15:53:49,752 : INFO : Doc:  62000 -> Total Lines to write:    68034
2017-04-09 15:57:19,548 : INFO : Doc:  63000 -> Total Lines to write:   102034
2017-04-09 16:01:08,497 : INFO : Doc:  64000 -> Total Lines to write:   136034
2017-04-09 16:05:12,110 : INFO : Doc:  65000 -> Total Lines to write:   170034
2017-04-09 16:08:59,667 : INFO : Doc:  66000 -> Total Lines to write:   204034
2017-04-09 16:13:06,318 : INFO : Doc:  67000 -> Total Lines to write:   238034
2017-04-09 16:17:09,561 : INFO : Doc:  68000 -> Total Lines to write:   272034
2017-04-09 16:20:55,102 : INFO : Doc:  69000 -> Total Lines to write:   306034


9901 08124989
9902 07256083
9903 07156768
9904 07256081
9905 08077101
9906 08947185
9907 08344608
9908 08135716
9909 08832241
9910 07868387
9911 07765739
9912 08201026
9913 07157419
9914 08375068
9915 08077671
9916 08220098
9917 08220097
9918 08220096
9919 07410901
9920 07689870
9921 07765734
9922 07157414
9923 07004038
9924 07299058
9925 08862622
9926 07526627
9927 08605647
9928 08690261
9929 08937442
9930 08908705
9931 07871290
9932 08220331
9933 07668057
9934 08800629
9935 08102248
9936 07177754
9937 07177755
9938 08791969
9939 07141621
9940 07374676
9941 08791966
9942 07374675
9943 07374672
9944 08791962
9945 07548519
9946 08508349
9947 08488117
9948 08212152
9949 08723448
9950 08207318
9951 07612796


2017-04-09 16:24:28,111 : INFO : Finished batch 60000 of size 10000 in 38m 4s
2017-04-09 16:24:28,113 : INFO : For index 60000, the actual number of lines written is: 338368


writing batch 60000
finished writing batch 60000


In [54]:
validation_docs_list[69950: 69953]

[u'08207318', u'07612796', u'08207316']

In [51]:
validation_docs_list[69952]

u'08207316'

In [68]:
start_index = 69953
info("Batch creation working on {}\n".format(start_index))
start_time = time.time()

for doc_index, doc_id in enumerate(DOCS_LIST[start_index:70000]):
    print doc_index, doc_id

    patent_doc = get_patent(doc_id)

    # Abstract
    abstract = patent_doc['abstract'][0]
    root = ET.fromstring(abstract.encode('utf-8'))
    abs_paragraphs = get_adjusted_paragraphs(root)

    # Description
    desc = patent_doc['description'][0]
    root = ET.fromstring(desc.encode('utf-8'))
    desc_paragraphs = get_adjusted_paragraphs(root)

    # Claims
    claims = patent_doc['claims'][0]
    root = ET.fromstring(claims.encode('utf-8'))
    claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
#         claims_paragraphs = []
#         for claim in patent_doc['claims']:
#             claims_paragraphs.append(claim.strip())

#         abstract_sentences = sum([get_sentences(abs_parag) for abs_parag in abs_paragraphs], [])
#         desc_sentences = sum([get_sentences(desc_parag) for desc_parag in desc_paragraphs], [])
#         claims_sentences = sum([get_sentences(claim_parag) for claim_parag in claims_paragraphs], [])


    abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
    desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
    claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])


    # lists of list of tokens
    doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
    abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
    description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
    claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens

    # now add the tokens lists that will be written to the file
    token_lines.append(doc_tokens_list)
    token_lines.append(abstract_tokens_list)
    token_lines.append(description_tokens_list)
    token_lines.append(claims_tokens_list)

    for i in range(NUM_ABSTRACT_PARTS):
        start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
        token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])

    for i in range(NUM_DESC_PARTS):
        start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
        token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    

    for i in range(NUM_CLAIMS_PARTS):
        start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
        token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])

duration = time.time() - start_time
info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))


2017-04-10 01:11:37,344 : INFO : Batch creation working on 69953



0 08002075
1 08457003
2 08811420
3 08235497
4 08872443
5 08036304
6 08274294
7 07612300
8 07672841
9 07226675
10 08920199
11 07265847
12 07051244
13 07993771
14 07600946
15 07051248
16 08121513
17 07890307
18 07644315
19 07444309
20 07644317
21 07826008
22 07877085
23 07472013
24 08771000
25 07444300
26 07444301
27 07956763
28 08276676
29 08465626
30 07354709
31 08903939
32 08171054
33 08294036
34 07400332
35 08033773
36 07348728
37 07460605
38 07786687
39 07052574
40 07194614
41 07167804
42 07052577
43 08391069
44 08085626
45 07052572


2017-04-10 01:11:48,342 : INFO : Finished batch 69953 of size 10000 in 0m 11s
2017-04-10 01:11:48,344 : INFO : For index 69953, the actual number of lines written is: 1666


46 07391325


In [69]:
len(token_lines)

1666

In [70]:
with open(VALIDATION_PREPROCESSED_FILES_PREFIX + str(60000), 'a+') as batch_file:
    for line in token_lines:
        batch_file.write((u" ".join(line) + "\n").encode('utf-8'))

## Test

In [15]:
DOCS_LIST = test_docs_list
FILE_PREFIX = TEST_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(test_docs_list)

In [16]:
try:
    pool = ThreadPool(8)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-04-07 00:57:25,762 : INFO : Batch creation working on 40000

2017-04-07 00:57:25,761 : INFO : Batch creation working on 20000

2017-04-07 00:57:25,761 : INFO : Batch creation working on 0

2017-04-07 00:57:25,762 : INFO : Batch creation working on 100000

2017-04-07 00:57:25,763 : INFO : Batch creation working on 60000

2017-04-07 00:57:25,762 : INFO : Batch creation working on 80000

2017-04-07 00:57:25,763 : INFO : Batch creation working on 120000

2017-04-07 00:57:25,763 : INFO : Batch creation working on 140000

2017-04-07 00:57:26,605 : INFO : Doc: 100000 -> Total Lines to write:       34
2017-04-07 00:57:26,662 : INFO : Doc:  40000 -> Total Lines to write:       34
2017-04-07 00:57:26,678 : INFO : Doc:  80000 -> Total Lines to write:       34
2017-04-07 00:57:26,718 : INFO : Doc:  20000 -> Total Lines to write:       34
2017-04-07 00:57:26,841 : INFO : Doc: 120000 -> Total Lines to write:       34
2017-04-07 00:57:26,890 : INFO : Doc:      0 -> Total Lines to write:       34

writing batch 20000


2017-04-07 01:39:58,210 : INFO : Batch creation working on 30000

2017-04-07 01:39:58,586 : INFO : Doc:  30000 -> Total Lines to write:       34
2017-04-07 01:40:34,885 : INFO : Finished batch 80000 of size 10000 in 43m 9s
2017-04-07 01:40:34,887 : INFO : For index 80000, the actual number of lines written is: 340000


writing batch 80000


2017-04-07 01:40:59,764 : INFO : Finished batch 100000 of size 10000 in 43m 34s
2017-04-07 01:40:59,767 : INFO : For index 100000, the actual number of lines written is: 340000


writing batch 100000


2017-04-07 01:41:05,608 : INFO : Finished batch 0 of size 10000 in 43m 40s
2017-04-07 01:41:05,635 : INFO : For index 0, the actual number of lines written is: 340000


writing batch 0


2017-04-07 01:41:07,715 : INFO : Batch creation working on 90000

2017-04-07 01:41:08,194 : INFO : Doc:  90000 -> Total Lines to write:       34
2017-04-07 01:41:33,138 : INFO : Batch creation working on 110000

2017-04-07 01:41:33,608 : INFO : Doc: 110000 -> Total Lines to write:       34
2017-04-07 01:41:35,287 : INFO : Finished batch 140000 of size 10000 in 44m 9s
2017-04-07 01:41:35,290 : INFO : For index 140000, the actual number of lines written is: 340000


writing batch 140000


2017-04-07 01:41:36,831 : INFO : Batch creation working on 10000

2017-04-07 01:41:37,539 : INFO : Doc:  10000 -> Total Lines to write:       34
2017-04-07 01:42:01,777 : INFO : Finished batch 120000 of size 10000 in 44m 36s
2017-04-07 01:42:01,779 : INFO : For index 120000, the actual number of lines written is: 340000


writing batch 120000


2017-04-07 01:42:09,070 : INFO : Batch creation working on 150000

2017-04-07 01:42:09,717 : INFO : Doc: 150000 -> Total Lines to write:       34
2017-04-07 01:42:34,234 : INFO : Finished batch 60000 of size 10000 in 45m 8s
2017-04-07 01:42:34,237 : INFO : For index 60000, the actual number of lines written is: 340000


writing batch 60000


2017-04-07 01:42:38,368 : INFO : Batch creation working on 130000

2017-04-07 01:42:38,833 : INFO : Doc: 130000 -> Total Lines to write:       34
2017-04-07 01:42:48,055 : INFO : Finished batch 40000 of size 10000 in 45m 22s
2017-04-07 01:42:48,059 : INFO : For index 40000, the actual number of lines written is: 340000


writing batch 40000


2017-04-07 01:43:08,671 : INFO : Batch creation working on 70000

2017-04-07 01:43:09,211 : INFO : Doc:  70000 -> Total Lines to write:       34
2017-04-07 01:43:20,496 : INFO : Batch creation working on 50000

2017-04-07 01:43:20,977 : INFO : Doc:  50000 -> Total Lines to write:       34
2017-04-07 01:44:07,528 : INFO : Doc:  31000 -> Total Lines to write:    34034
2017-04-07 01:45:30,103 : INFO : Doc:  11000 -> Total Lines to write:    34034
2017-04-07 01:45:30,508 : INFO : Doc:  91000 -> Total Lines to write:    34034
2017-04-07 01:45:50,087 : INFO : Doc: 111000 -> Total Lines to write:    34034
2017-04-07 01:46:21,028 : INFO : Doc: 151000 -> Total Lines to write:    34034
2017-04-07 01:46:59,838 : INFO : Doc: 131000 -> Total Lines to write:    34034
2017-04-07 01:47:15,799 : INFO : Doc:  71000 -> Total Lines to write:    34034
2017-04-07 01:47:18,953 : INFO : Doc:  51000 -> Total Lines to write:    34034
2017-04-07 01:48:10,830 : INFO : Doc:  32000 -> Total Lines to write:    68034

writing batch 30000


2017-04-07 02:22:13,403 : INFO : Doc:  79000 -> Total Lines to write:   306034
2017-04-07 02:22:19,999 : INFO : Finished batch 90000 of size 10000 in 41m 12s
2017-04-07 02:22:20,016 : INFO : For index 90000, the actual number of lines written is: 340000


writing batch 90000


2017-04-07 02:22:29,854 : INFO : Batch creation working on 160000

2017-04-07 02:22:30,011 : INFO : Doc: 160000 -> Total Lines to write:       34
2017-04-07 02:22:53,426 : INFO : Batch creation working on 180000

2017-04-07 02:22:53,821 : INFO : Doc: 180000 -> Total Lines to write:       34
2017-04-07 02:23:06,833 : INFO : Finished batch 110000 of size 10000 in 41m 34s
2017-04-07 02:23:06,848 : INFO : For index 110000, the actual number of lines written is: 340000


writing batch 110000


2017-04-07 02:23:41,096 : INFO : Batch creation working on 200000

2017-04-07 02:23:41,934 : INFO : Doc: 200000 -> Total Lines to write:       34
2017-04-07 02:23:56,983 : INFO : Finished batch 10000 of size 10000 in 42m 20s
2017-04-07 02:23:56,988 : INFO : For index 10000, the actual number of lines written is: 340000


writing batch 10000


2017-04-07 02:24:14,393 : INFO : Finished batch 150000 of size 10000 in 42m 5s
2017-04-07 02:24:14,403 : INFO : For index 150000, the actual number of lines written is: 340000


writing batch 150000


2017-04-07 02:24:34,305 : INFO : Batch creation working on 220000

2017-04-07 02:24:35,775 : INFO : Doc: 220000 -> Total Lines to write:       34
2017-04-07 02:24:46,794 : INFO : Batch creation working on 240000

2017-04-07 02:24:47,453 : INFO : Doc: 240000 -> Total Lines to write:       34
2017-04-07 02:25:14,859 : INFO : Finished batch 130000 of size 10000 in 42m 36s
2017-04-07 02:25:14,867 : INFO : For index 130000, the actual number of lines written is: 340000


writing batch 130000


2017-04-07 02:25:47,047 : INFO : Batch creation working on 260000

2017-04-07 02:25:47,285 : INFO : Doc: 260000 -> Total Lines to write:       34
2017-04-07 02:25:57,309 : INFO : Finished batch 50000 of size 10000 in 42m 37s
2017-04-07 02:25:57,312 : INFO : For index 50000, the actual number of lines written is: 340000


writing batch 50000


2017-04-07 02:26:28,635 : INFO : Batch creation working on 280000

2017-04-07 02:26:28,857 : INFO : Doc: 280000 -> Total Lines to write:       34
2017-04-07 02:26:34,392 : INFO : Finished batch 70000 of size 10000 in 43m 26s
2017-04-07 02:26:34,396 : INFO : For index 70000, the actual number of lines written is: 340000


writing batch 70000


2017-04-07 02:26:34,857 : INFO : Doc: 161000 -> Total Lines to write:    34034
2017-04-07 02:27:01,528 : INFO : Doc: 181000 -> Total Lines to write:    34034
2017-04-07 02:27:07,019 : INFO : Batch creation working on 300000

2017-04-07 02:27:07,820 : INFO : Doc: 300000 -> Total Lines to write:       34
2017-04-07 02:27:49,095 : INFO : Doc: 201000 -> Total Lines to write:    34034
2017-04-07 02:28:51,693 : INFO : Doc: 241000 -> Total Lines to write:    34034
2017-04-07 02:28:59,515 : INFO : Doc: 221000 -> Total Lines to write:    34034
2017-04-07 02:30:13,192 : INFO : Doc: 261000 -> Total Lines to write:    34034
2017-04-07 02:31:03,850 : INFO : Doc: 281000 -> Total Lines to write:    34034
2017-04-07 02:31:14,755 : INFO : Doc: 182000 -> Total Lines to write:    68034
2017-04-07 02:31:16,979 : INFO : Doc: 162000 -> Total Lines to write:    68034
2017-04-07 02:31:46,721 : INFO : Doc: 301000 -> Total Lines to write:    34034
2017-04-07 02:32:41,557 : INFO : Doc: 202000 -> Total Lines to w

writing batch 160000


2017-04-07 03:13:36,293 : INFO : Doc: 269000 -> Total Lines to write:   306034
2017-04-07 03:13:53,686 : INFO : Batch creation working on 170000

2017-04-07 03:13:54,019 : INFO : Doc: 170000 -> Total Lines to write:       34
2017-04-07 03:14:30,375 : INFO : Doc: 289000 -> Total Lines to write:   306034
2017-04-07 03:14:48,145 : INFO : Doc: 309000 -> Total Lines to write:   306034
2017-04-07 03:14:51,841 : INFO : Finished batch 200000 of size 10000 in 51m 10s
2017-04-07 03:14:51,843 : INFO : For index 200000, the actual number of lines written is: 340000


writing batch 200000


2017-04-07 03:15:26,132 : INFO : Batch creation working on 210000

2017-04-07 03:15:26,549 : INFO : Doc: 210000 -> Total Lines to write:       34
2017-04-07 03:15:37,970 : INFO : Finished batch 180000 of size 10000 in 52m 44s
2017-04-07 03:15:37,973 : INFO : For index 180000, the actual number of lines written is: 340000


writing batch 180000


2017-04-07 03:16:03,994 : INFO : Finished batch 240000 of size 10000 in 51m 17s
2017-04-07 03:16:04,011 : INFO : For index 240000, the actual number of lines written is: 340000


writing batch 240000


2017-04-07 03:16:11,503 : INFO : Batch creation working on 190000

2017-04-07 03:16:11,861 : INFO : Doc: 190000 -> Total Lines to write:       34
2017-04-07 03:16:34,515 : INFO : Batch creation working on 250000

2017-04-07 03:16:34,879 : INFO : Doc: 250000 -> Total Lines to write:       34
2017-04-07 03:16:48,549 : INFO : Finished batch 220000 of size 10000 in 52m 14s
2017-04-07 03:16:48,551 : INFO : For index 220000, the actual number of lines written is: 340000


writing batch 220000


2017-04-07 03:17:20,388 : INFO : Batch creation working on 230000

2017-04-07 03:17:20,487 : INFO : Doc: 230000 -> Total Lines to write:       34
2017-04-07 03:17:32,551 : INFO : Finished batch 260000 of size 10000 in 51m 45s
2017-04-07 03:17:32,560 : INFO : For index 260000, the actual number of lines written is: 340000


writing batch 260000


2017-04-07 03:17:36,298 : INFO : Doc: 171000 -> Total Lines to write:    34034
2017-04-07 03:18:03,297 : INFO : Batch creation working on 270000

2017-04-07 03:18:03,551 : INFO : Doc: 270000 -> Total Lines to write:       34
2017-04-07 03:18:33,032 : INFO : Finished batch 280000 of size 10000 in 52m 4s
2017-04-07 03:18:33,034 : INFO : For index 280000, the actual number of lines written is: 340000


writing batch 280000


2017-04-07 03:18:49,543 : INFO : Finished batch 300000 of size 10000 in 51m 42s
2017-04-07 03:18:49,546 : INFO : For index 300000, the actual number of lines written is: 340000


writing batch 300000


2017-04-07 03:19:05,146 : INFO : Batch creation working on 290000

2017-04-07 03:19:05,404 : INFO : Doc: 290000 -> Total Lines to write:       34
2017-04-07 03:19:13,716 : INFO : Doc: 211000 -> Total Lines to write:    34034
2017-04-07 03:19:20,602 : INFO : Batch creation working on 310000

2017-04-07 03:19:20,964 : INFO : Doc: 310000 -> Total Lines to write:       34
2017-04-07 03:19:53,907 : INFO : Doc: 191000 -> Total Lines to write:    34034
2017-04-07 03:20:14,543 : INFO : Doc: 251000 -> Total Lines to write:    34034
2017-04-07 03:21:27,010 : INFO : Doc: 231000 -> Total Lines to write:    34034
2017-04-07 03:21:32,054 : INFO : Doc: 172000 -> Total Lines to write:    68034
2017-04-07 03:22:13,187 : INFO : Doc: 271000 -> Total Lines to write:    34034
2017-04-07 03:23:04,029 : INFO : Doc: 291000 -> Total Lines to write:    34034
2017-04-07 03:23:12,808 : INFO : Doc: 212000 -> Total Lines to write:    68034
2017-04-07 03:23:32,393 : INFO : Doc: 311000 -> Total Lines to write:    340

writing batch 170000


2017-04-07 03:53:47,085 : INFO : Batch creation working on 320000

2017-04-07 03:53:47,432 : INFO : Doc: 320000 -> Total Lines to write:       34
2017-04-07 03:54:09,654 : INFO : Doc: 299000 -> Total Lines to write:   306034
2017-04-07 03:54:16,269 : INFO : Doc: 279000 -> Total Lines to write:   306034
2017-04-07 03:56:03,085 : INFO : Finished batch 250000 of size 10000 in 39m 28s
2017-04-07 03:56:03,089 : INFO : For index 250000, the actual number of lines written is: 340000


writing batch 250000


2017-04-07 03:56:17,588 : INFO : Doc: 319000 -> Total Lines to write:   306034
2017-04-07 03:56:25,203 : INFO : Finished batch 190000 of size 10000 in 40m 13s
2017-04-07 03:56:25,206 : INFO : For index 190000, the actual number of lines written is: 340000


writing batch 190000


2017-04-07 03:56:27,512 : INFO : Finished batch 230000 of size 10000 in 39m 7s


writing batch 230000


2017-04-07 03:56:27,514 : INFO : For index 230000, the actual number of lines written is: 340000
2017-04-07 03:56:33,535 : INFO : Batch creation working on 340000

2017-04-07 03:56:33,964 : INFO : Doc: 340000 -> Total Lines to write:       34
2017-04-07 03:56:57,600 : INFO : Batch creation working on 360000

2017-04-07 03:56:57,946 : INFO : Doc: 360000 -> Total Lines to write:       34
2017-04-07 03:56:59,103 : INFO : Batch creation working on 380000

2017-04-07 03:56:59,417 : INFO : Doc: 380000 -> Total Lines to write:       34
2017-04-07 03:57:27,024 : INFO : Finished batch 210000 of size 10000 in 42m 1s
2017-04-07 03:57:27,027 : INFO : For index 210000, the actual number of lines written is: 340000


writing batch 210000


2017-04-07 03:57:50,936 : INFO : Doc: 321000 -> Total Lines to write:    34034
2017-04-07 03:57:59,589 : INFO : Batch creation working on 400000

2017-04-07 03:57:59,868 : INFO : Doc: 400000 -> Total Lines to write:       34
2017-04-07 03:58:04,165 : INFO : Finished batch 290000 of size 10000 in 38m 59s
2017-04-07 03:58:04,168 : INFO : For index 290000, the actual number of lines written is: 340000


writing batch 290000


2017-04-07 03:58:25,136 : INFO : Finished batch 270000 of size 10000 in 40m 22s
2017-04-07 03:58:25,139 : INFO : For index 270000, the actual number of lines written is: 340000


writing batch 270000


2017-04-07 04:00:18,910 : INFO : Doc: 341000 -> Total Lines to write:    34034
2017-04-07 04:00:19,292 : INFO : Doc: 361000 -> Total Lines to write:    34034
2017-04-07 04:00:24,365 : INFO : Doc: 381000 -> Total Lines to write:    34034
2017-04-07 04:00:28,972 : INFO : Finished batch 310000 of size 10000 in 41m 8s
2017-04-07 04:00:28,975 : INFO : For index 310000, the actual number of lines written is: 340000


writing batch 310000


2017-04-07 04:01:31,298 : INFO : Doc: 401000 -> Total Lines to write:    34034
2017-04-07 04:01:31,635 : INFO : Doc: 322000 -> Total Lines to write:    68034
2017-04-07 04:03:34,462 : INFO : Doc: 362000 -> Total Lines to write:    68034
2017-04-07 04:03:44,179 : INFO : Doc: 342000 -> Total Lines to write:    68034
2017-04-07 04:03:46,638 : INFO : Doc: 382000 -> Total Lines to write:    68034
2017-04-07 04:04:32,052 : INFO : Finished batch 400000 of size 10000 in 6m 32s
2017-04-07 04:04:32,055 : INFO : For index 400000, the actual number of lines written is: 63818


writing batch 400000


2017-04-07 04:04:43,082 : INFO : Doc: 323000 -> Total Lines to write:   102034
2017-04-07 04:06:58,823 : INFO : Doc: 343000 -> Total Lines to write:   102034
2017-04-07 04:07:09,454 : INFO : Doc: 363000 -> Total Lines to write:   102034
2017-04-07 04:07:13,724 : INFO : Doc: 383000 -> Total Lines to write:   102034
2017-04-07 04:08:09,639 : INFO : Doc: 324000 -> Total Lines to write:   136034
2017-04-07 04:10:14,922 : INFO : Doc: 344000 -> Total Lines to write:   136034
2017-04-07 04:10:35,349 : INFO : Doc: 384000 -> Total Lines to write:   136034
2017-04-07 04:10:40,166 : INFO : Doc: 364000 -> Total Lines to write:   136034
2017-04-07 04:11:47,573 : INFO : Doc: 325000 -> Total Lines to write:   170034
2017-04-07 04:13:59,797 : INFO : Doc: 345000 -> Total Lines to write:   170034
2017-04-07 04:14:01,059 : INFO : Doc: 385000 -> Total Lines to write:   170034
2017-04-07 04:14:18,920 : INFO : Doc: 365000 -> Total Lines to write:   170034
2017-04-07 04:15:12,882 : INFO : Doc: 326000 -> Tota

writing batch 320000


2017-04-07 04:30:13,266 : INFO : Batch creation working on 330000

2017-04-07 04:30:13,611 : INFO : Doc: 330000 -> Total Lines to write:       34
2017-04-07 04:31:02,345 : INFO : Finished batch 340000 of size 10000 in 34m 29s
2017-04-07 04:31:02,347 : INFO : For index 340000, the actual number of lines written is: 340000


writing batch 340000


2017-04-07 04:31:33,443 : INFO : Batch creation working on 350000

2017-04-07 04:31:33,815 : INFO : Doc: 350000 -> Total Lines to write:       34
2017-04-07 04:31:36,600 : INFO : Finished batch 380000 of size 10000 in 34m 37s
2017-04-07 04:31:36,604 : INFO : For index 380000, the actual number of lines written is: 340000


writing batch 380000


2017-04-07 04:31:46,666 : INFO : Finished batch 360000 of size 10000 in 34m 49s
2017-04-07 04:31:46,668 : INFO : For index 360000, the actual number of lines written is: 340000


writing batch 360000


2017-04-07 04:32:05,648 : INFO : Batch creation working on 390000

2017-04-07 04:32:05,953 : INFO : Doc: 390000 -> Total Lines to write:       34
2017-04-07 04:32:18,109 : INFO : Batch creation working on 370000

2017-04-07 04:32:18,388 : INFO : Doc: 370000 -> Total Lines to write:       34
2017-04-07 04:33:37,345 : INFO : Doc: 331000 -> Total Lines to write:    34034
2017-04-07 04:34:54,146 : INFO : Doc: 351000 -> Total Lines to write:    34034
2017-04-07 04:35:24,175 : INFO : Doc: 391000 -> Total Lines to write:    34034
2017-04-07 04:35:36,377 : INFO : Doc: 371000 -> Total Lines to write:    34034
2017-04-07 04:36:55,593 : INFO : Doc: 332000 -> Total Lines to write:    68034
2017-04-07 04:38:03,813 : INFO : Doc: 352000 -> Total Lines to write:    68034
2017-04-07 04:38:51,530 : INFO : Doc: 392000 -> Total Lines to write:    68034
2017-04-07 04:39:06,201 : INFO : Doc: 372000 -> Total Lines to write:    68034
2017-04-07 04:40:00,912 : INFO : Doc: 333000 -> Total Lines to write:   1020

writing batch 330000


2017-04-07 05:05:36,881 : INFO : Finished batch 350000 of size 10000 in 34m 3s
2017-04-07 05:05:36,884 : INFO : For index 350000, the actual number of lines written is: 340000


writing batch 350000


2017-04-07 05:06:47,331 : INFO : Finished batch 390000 of size 10000 in 34m 41s
2017-04-07 05:06:47,334 : INFO : For index 390000, the actual number of lines written is: 340000


writing batch 390000


2017-04-07 05:07:54,237 : INFO : Finished batch 370000 of size 10000 in 35m 36s
2017-04-07 05:07:54,241 : INFO : For index 370000, the actual number of lines written is: 340000


writing batch 370000


## Stats

In [37]:
BATCH_SIZE = 1000

In [38]:
def multithreaded_extended_batch_creation(start_index):

#     if os.path.exists(FILE_PREFIX + str(start_index)):
#         info("Batch {} already exists, skipping..".format(start_index))
#         return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()
    
    len_abs_sentences = 0
    len_desc_sentences = 0
    len_desc_paragraphs = 0
    len_claims_sentences = 0
    
    len_abs_tokens = []
    len_desc_tokens = []
    len_claims_tokens = []
    
    
    len_desc_parag_tokens = []

    len_claims_paragraphs = []
    
    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        patent_doc = get_patent(doc_id)
        
        # Abstract
        abstract = patent_doc['abstract'][0]
        root = ET.fromstring(abstract.encode('utf-8'))
        abs_paragraphs = get_adjusted_paragraphs(root)
        
        # Description
        desc = patent_doc['description'][0]
        root = ET.fromstring(desc.encode('utf-8'))
        desc_paragraphs = get_adjusted_paragraphs(root)
        
        # Claims
        claims = patent_doc['claims'][0]
        root = ET.fromstring(claims.encode('utf-8'))
        claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
#         claims_paragraphs = []
#         for claim in patent_doc['claims']:
#             claims_paragraphs.append(claim.strip())

        len_claims_paragraphs.append(len(claims_paragraphs))
    
        abstract_sentences = sum([get_sentences(abs_parag) for abs_parag in abs_paragraphs], [])
        desc_sentences = sum([get_sentences(desc_parag) for desc_parag in desc_paragraphs], [])
        claims_sentences = sum([get_sentences(claim_parag) for claim_parag in claims_paragraphs], [])
        
        len_abs_sentences += len(abstract_sentences)
        len_desc_sentences += len(desc_sentences)
        len_claims_sentences += len(claims_sentences)
        
        len_desc_paragraphs += len(desc_paragraphs)
        

        abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
        
        len_desc_parag_tokens.extend([len(sentence_wordtokenizer(parag)) for parag in desc_paragraphs])
        desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
        
        claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
        
        
        len_abs_tokens.append(len(abstract_tokens))
        len_desc_tokens.append(len(desc_tokens))
        len_claims_tokens.append(len(claims_tokens))
        
        # lists of list of tokens
        doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
        abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
        description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
        claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens
        
        # now add the tokens lists that will be written to the file
        token_lines.append(doc_tokens_list)
        token_lines.append(abstract_tokens_list)
        token_lines.append(description_tokens_list)
        token_lines.append(claims_tokens_list)
        
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    print "Average Abstract Sentences: {}".format(len_abs_sentences/doc_index)
    print "Average Desc Sentences: {}".format(len_desc_sentences/doc_index)
    print "Average Desc Paragraphs: {}".format(len_desc_paragraphs/doc_index)
    print "Average Claims Sentences: {}".format(len_claims_sentences/doc_index)
    
    
    print "Abstract Tokens: Mean: {} - Median: {}".format(np.mean(len_abs_tokens), np.median(len_abs_tokens))
    print "Description Tokens: Mean: {} - Median: {}".format(np.mean(len_desc_tokens), np.median(len_desc_tokens))
    print "Claims Tokens: Mean: {} - Median: {}".format(np.mean(len_claims_tokens), np.median(len_claims_tokens))
    print "Description Paragraphs Tokens: Mean: {} - Median: {}".format(np.mean(len_desc_parag_tokens), 
                                                                        np.median(len_desc_parag_tokens))
    
    print "Claims Paragraphs: Mean: {} - Median: {}".format(np.mean(len_claims_paragraphs), 
                                                                        np.median(len_claims_paragraphs))
#     write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines

In [39]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [40]:
try:
    pool = ThreadPool(8) # use just 6 because every batch requires a lot of memory
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE*10 )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-16 01:00:43,555 : INFO : Batch creation working on 0

2017-03-16 01:00:43,555 : INFO : Batch creation working on 10000

2017-03-16 01:00:43,558 : INFO : Batch creation working on 40000

2017-03-16 01:00:43,555 : INFO : Batch creation working on 20000

2017-03-16 01:00:43,558 : INFO : Batch creation working on 50000

2017-03-16 01:00:43,559 : INFO : Batch creation working on 30000

2017-03-16 01:00:43,591 : INFO : Batch creation working on 60000

2017-03-16 01:00:43,601 : INFO : Batch creation working on 70000

2017-03-16 01:00:43,690 : INFO : Doc:  50000 -> Total Lines to write:        4
2017-03-16 01:00:43,742 : INFO : Doc:  60000 -> Total Lines to write:        4
2017-03-16 01:00:43,767 : INFO : Doc:      0 -> Total Lines to write:        4
2017-03-16 01:00:43,806 : INFO : Doc:  40000 -> Total Lines to write:        4
2017-03-16 01:00:43,846 : INFO : Doc:  20000 -> Total Lines to write:        4
2017-03-16 01:00:43,904 : INFO : Doc:  10000 -> Total Lines to write:        4
20

Average Abstract Sentences: 3
Average Desc Sentences: 236
Average Desc Paragraphs: 56
Average Claims Sentences: 19
Abstract Tokens: Mean: 118.576 - Median: 114.0
Description Tokens: Mean: 7694.354 - Median: 5256.5
Claims Tokens: Mean: 1101.352 - Median: 828.0
Description Paragraphs Tokens: Mean: 137.372194747 - Median: 113.0
Claims Paragraphs: Mean: 19.411 - Median: 16.0


2017-03-16 01:05:31,749 : INFO : Batch creation working on 80000

2017-03-16 01:05:33,147 : INFO : Doc:  80000 -> Total Lines to write:        4
2017-03-16 01:05:49,041 : INFO : Finished batch 40000 of size 1000 in 5m 5s
2017-03-16 01:05:49,060 : INFO : For index 40000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 261
Average Desc Paragraphs: 61
Average Claims Sentences: 17
Abstract Tokens: Mean: 119.666 - Median: 119.0
Description Tokens: Mean: 8411.443 - Median: 6107.0
Claims Tokens: Mean: 1111.596 - Median: 944.0
Description Paragraphs Tokens: Mean: 137.076788944 - Median: 113.0
Claims Paragraphs: Mean: 17.226 - Median: 16.0


2017-03-16 01:05:49,886 : INFO : Batch creation working on 90000

2017-03-16 01:05:50,033 : INFO : Doc:  90000 -> Total Lines to write:        4
2017-03-16 01:05:56,244 : INFO : Finished batch 50000 of size 1000 in 5m 13s
2017-03-16 01:05:56,248 : INFO : For index 50000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 265
Average Desc Paragraphs: 62
Average Claims Sentences: 17
Abstract Tokens: Mean: 116.916 - Median: 118.0
Description Tokens: Mean: 8561.637 - Median: 5910.0
Claims Tokens: Mean: 1082.717 - Median: 892.0
Description Paragraphs Tokens: Mean: 137.32235713 - Median: 112.0
Claims Paragraphs: Mean: 17.019 - Median: 15.0


2017-03-16 01:05:57,120 : INFO : Batch creation working on 100000

2017-03-16 01:05:57,343 : INFO : Doc: 100000 -> Total Lines to write:        4
2017-03-16 01:05:58,003 : INFO : Finished batch 20000 of size 1000 in 5m 14s
2017-03-16 01:05:58,006 : INFO : For index 20000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 266
Average Desc Paragraphs: 61
Average Claims Sentences: 17
Abstract Tokens: Mean: 118.616 - Median: 121.0
Description Tokens: Mean: 8539.343 - Median: 5729.0
Claims Tokens: Mean: 1096.825 - Median: 894.0
Description Paragraphs Tokens: Mean: 138.749581607 - Median: 113.0
Claims Paragraphs: Mean: 17.846 - Median: 15.0


2017-03-16 01:05:59,193 : INFO : Batch creation working on 110000

2017-03-16 01:06:00,006 : INFO : Doc: 110000 -> Total Lines to write:        4
2017-03-16 01:06:07,053 : INFO : Finished batch 30000 of size 1000 in 5m 23s
2017-03-16 01:06:07,059 : INFO : For index 30000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 271
Average Desc Paragraphs: 64
Average Claims Sentences: 17
Abstract Tokens: Mean: 118.392 - Median: 118.0
Description Tokens: Mean: 8803.666 - Median: 6171.5
Claims Tokens: Mean: 1099.404 - Median: 923.0
Description Paragraphs Tokens: Mean: 137.510012183 - Median: 113.0
Claims Paragraphs: Mean: 17.875 - Median: 16.0


2017-03-16 01:06:08,003 : INFO : Batch creation working on 120000

2017-03-16 01:06:08,152 : INFO : Doc: 120000 -> Total Lines to write:        4
2017-03-16 01:06:14,886 : INFO : Finished batch 10000 of size 1000 in 5m 31s
2017-03-16 01:06:14,891 : INFO : For index 10000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 272
Average Desc Paragraphs: 63
Average Claims Sentences: 19
Abstract Tokens: Mean: 117.434 - Median: 116.0
Description Tokens: Mean: 8971.263 - Median: 5658.5
Claims Tokens: Mean: 1131.289 - Median: 896.0
Description Paragraphs Tokens: Mean: 141.313113334 - Median: 114.0
Claims Paragraphs: Mean: 19.008 - Median: 16.0


2017-03-16 01:06:18,493 : INFO : Finished batch 70000 of size 1000 in 5m 35s
2017-03-16 01:06:18,497 : INFO : For index 70000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 282
Average Desc Paragraphs: 66
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.501 - Median: 117.0
Description Tokens: Mean: 9176.843 - Median: 6447.5
Claims Tokens: Mean: 1058.939 - Median: 924.5
Description Paragraphs Tokens: Mean: 137.616864615 - Median: 112.0
Claims Paragraphs: Mean: 16.587 - Median: 16.0


2017-03-16 01:06:41,087 : INFO : Finished batch 60000 of size 1000 in 5m 57s
2017-03-16 01:06:41,089 : INFO : For index 60000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 295
Average Desc Paragraphs: 71
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.293 - Median: 118.0
Description Tokens: Mean: 9770.952 - Median: 6617.0
Claims Tokens: Mean: 1059.014 - Median: 906.5
Description Paragraphs Tokens: Mean: 137.735438399 - Median: 112.0
Claims Paragraphs: Mean: 16.633 - Median: 16.0


2017-03-16 01:07:02,979 : INFO : Finished batch 120000 of size 1000 in 0m 55s
2017-03-16 01:07:02,982 : INFO : For index 120000, the actual number of lines written is: 624


Average Abstract Sentences: 3
Average Desc Sentences: 295
Average Desc Paragraphs: 67
Average Claims Sentences: 34
Abstract Tokens: Mean: 123.487179487 - Median: 123.5
Description Tokens: Mean: 9056.32051282 - Median: 6444.0
Claims Tokens: Mean: 2129.06410256 - Median: 1735.0
Description Paragraphs Tokens: Mean: 135.207771079 - Median: 111.0
Claims Paragraphs: Mean: 34.3141025641 - Median: 30.0


2017-03-16 01:10:54,091 : INFO : Finished batch 80000 of size 1000 in 5m 22s
2017-03-16 01:10:54,093 : INFO : For index 80000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 277
Average Desc Paragraphs: 65
Average Claims Sentences: 16
Abstract Tokens: Mean: 114.978 - Median: 118.0
Description Tokens: Mean: 9035.3 - Median: 6557.0
Claims Tokens: Mean: 1052.833 - Median: 908.0
Description Paragraphs Tokens: Mean: 137.333373866 - Median: 113.0
Claims Paragraphs: Mean: 16.561 - Median: 16.0


2017-03-16 01:11:22,433 : INFO : Finished batch 90000 of size 1000 in 5m 33s
2017-03-16 01:11:22,436 : INFO : For index 90000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 276
Average Desc Paragraphs: 65
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.039 - Median: 118.0
Description Tokens: Mean: 9215.904 - Median: 6593.5
Claims Tokens: Mean: 1066.299 - Median: 932.5
Description Paragraphs Tokens: Mean: 141.239908046 - Median: 116.0
Claims Paragraphs: Mean: 16.451 - Median: 16.0


2017-03-16 01:11:36,285 : INFO : Finished batch 100000 of size 1000 in 5m 39s
2017-03-16 01:11:36,288 : INFO : For index 100000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 287
Average Desc Paragraphs: 67
Average Claims Sentences: 16
Abstract Tokens: Mean: 113.675 - Median: 117.0
Description Tokens: Mean: 9544.564 - Median: 6973.5
Claims Tokens: Mean: 1108.746 - Median: 977.5
Description Paragraphs Tokens: Mean: 141.208486211 - Median: 116.0
Claims Paragraphs: Mean: 16.438 - Median: 17.0


2017-03-16 01:11:54,205 : INFO : Finished batch 110000 of size 1000 in 5m 55s
2017-03-16 01:11:54,208 : INFO : For index 110000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 311
Average Desc Paragraphs: 73
Average Claims Sentences: 16
Abstract Tokens: Mean: 114.923 - Median: 118.0
Description Tokens: Mean: 10124.423 - Median: 7225.5
Claims Tokens: Mean: 1053.8 - Median: 936.5
Description Paragraphs Tokens: Mean: 138.761056974 - Median: 115.0
Claims Paragraphs: Mean: 16.433 - Median: 16.0
