In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
import os
import re
import urllib2

import numpy as np
import random
import time

import json

import logging
from logging import info

from multiprocessing import Pool as ThreadPool
import itertools

import xml.etree.ElementTree as ET

import nltk

from thesis.utils.text import get_sentences, sentence_wordtokenizer

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
SAMPLE_RATIO = 0.15

In [4]:
# root_location = "/mnt/data2/shalaby/"
root_location = "/home/local/shalaby/"
exports_location = root_location + "exported_data/"

# training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)
training_file = root_location + 'docs_output.json'

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
# training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
# validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
training_docs_list_file = exports_location + "extended_pv_training_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
validation_docs_list_file = exports_location + "extended_pv_validation_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
test_docs_list_file = exports_location + "extended_pv_test_docs_list_" + str(SAMPLE_RATIO) + ".pkl"

In [5]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 24.9 s, sys: 1.3 s, total: 26.2 s
Wall time: 26.1 s


In [6]:
len(training_docs_list)

254767

#### Extraction Utils 

In [7]:
#ES_URL = 'http://localhost:9200/patents/patent/{}'
ES_URL = 'http://yell.dbs.ifi.lmu.de:9200/patents/patent/{}'
HEADING_TAG = 'heading'
PARAGRAPH_TAG = 'p'
UL_TAG = 'ul'
LI_TAG = 'li'
OL_TAG = 'ol'
DESC_OF_DRAWINGS_TAG = 'description-of-drawings'
MIN_PARAGRAPH_LENGTH = 50

In [8]:
def merge_with_previous(curr_node_tag, previous_node_tag, previous_node_text):
    if curr_node_tag == PARAGRAPH_TAG and previous_node_tag == HEADING_TAG:
        return True
    if previous_node_text and len(previous_node_text) < MIN_PARAGRAPH_LENGTH:
        return True
    return False
    
def get_paragraphs(root):
    paragraphs = []
    previous_node_text = None
    previous_tag = None
    for child in root:
        node_text = None
        if child.tag != DESC_OF_DRAWINGS_TAG:
            node_text = get_node_text(child)
            if node_text.strip():
                if merge_with_previous(child.tag, previous_tag, previous_node_text) and len(paragraphs) > 0:
                    paragraphs[-1] += ' ' + node_text
                else:
                    paragraphs.append(node_text)
        else:
            node_text = extract_desc_of_drawings_paragraph(child)
            paragraphs.append(node_text)
            
        previous_tag = child.tag
        previous_node_text = node_text
    return paragraphs
    
def extract_desc_of_drawings_paragraph(node):
    previous_tag = None
    sentences = []
    for child in node:
        node_text = get_node_text(child)
        if child.tag == PARAGRAPH_TAG and previous_tag == HEADING_TAG:
            sentences[-1] += ' ' + node_text
        else:
            # a paragraph in drawings descriptions is treated as a sentence
            if child.tag == PARAGRAPH_TAG:
                node_text = apply_sentence_end(node_text)
            sentences.append(node_text)
        previous_tag = child.tag
    
    return ' '.join(sentences)

def apply_sentence_end(text):
    if text and text.strip():
        text = text.strip().strip(';.')
        text += '. '
    return text

def itertext_custom(self):
    tag = self.tag
    if not isinstance(tag, basestring) and tag is not None:
        return
    if self.text:
        if tag == LI_TAG:
            yield apply_sentence_end(self.text)
        else:
            yield self.text.replace('\n',' ')
    for e in self:
        for s in e.itertext_custom():
            yield s
        if e.tail:
            yield e.tail

ET.Element.itertext_custom = itertext_custom
# def get_node_text(node):
#     node_text = ''
#     for child in node:
#         # for ul tags, get li tags as sentences
#         if child.tag == UL_TAG:
#             li_sentences = [apply_sentence_end(get_node_text_iterative(c)) for c in child]
#             child_text = ' '.join(li_sentences)
#         else:
#             child_text = get_node_text_iterative(child)
#         node_text += child_text
#     return node_text
        
get_node_text = lambda node: ''.join(node.itertext_custom()).strip()

In [9]:
def conc_paragraphs(parag1, parag2):
    return parag1.strip('.') + '.' + ' ' + parag2

def concatenate_sentences_to_paragraphs(paragraphs):
    """
    for 1 sentence paragraphs, concatenate them to the next or previous paragraph depending on context
    """
    for i in range(len(paragraphs)):
        if i >= len((paragraphs)): break
        parag = paragraphs[i]
        sentences = get_sentences(parag)
        
        if len(sentences) == 1:
            prev_paragraph = paragraphs[i-1] if i-1 >= 0 else None
            next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None

            if (next_paragraph and len(get_sentences(next_paragraph)) == 1):
                # If a series of 1 sentence length paragraphs exist, conc all of them in one paragraph
                while True:
                    if next_paragraph and len(get_sentences(next_paragraph)) == 1:
                        parag = conc_paragraphs(parag, next_paragraph)
                        paragraphs[i] = parag
                        del paragraphs[i+1]

                        # reinitialize for loop
                        next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None
                    else:
                        break

            # otherwise, just concatenate the 1 sentence paragraph to the previous paragraph
            elif prev_paragraph:
#                 print '============== Found prev eligible paragraph'
                prev_paragraph = conc_paragraphs(prev_paragraph, parag)
                paragraphs[i-1] = prev_paragraph
                del paragraphs[i]

            # if this is the first paragraph, then just concatenate it with the next one
            elif next_paragraph:
                parag = conc_paragraphs(parag, next_paragraph)
                paragraphs[i] = parag
                del paragraphs[i+1]

def get_adjusted_paragraphs(root, conc_sentences=True):
    paragraphs = get_paragraphs(root)
    if conc_sentences:
        concatenate_sentences_to_paragraphs(paragraphs)
    return paragraphs

In [10]:
def get_patent(doc_id):
    url_to_fetch = ES_URL.format(doc_id)

    response = urllib2.urlopen(url_to_fetch)
    patent_content = response.read()

    patent_object = json.loads(patent_content)['_source']
    return patent_object

# Actual Extraction

In [11]:
ABSTRACT_ID = "{}_abstract"
DESC_ID = "{}_description"
CLAIMS_ID = "{}_claims"

ABSTRACT_PART_ID = "{}_abstract_part-{}"
DESC_PART_ID = "{}_description_part-{}"
CLAIMS_PART_ID = "{}_claims_part-{}"

In [12]:
BATCH_SIZE = 10000

preprocessed_location = "/home/local/shalaby/" + "preprocessed_data/extended_pv_abs_desc_claims_large_sample_chunks/"
TRAINING_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_training_docs_data_preprocessed-"
VALIDATION_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_validation_docs_data_preprocessed-"
TEST_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_test_docs_data_preprocessed-"

if not os.path.exists(preprocessed_location):
    os.makedirs(preprocessed_location)

In [13]:
NUM_ABSTRACT_PARTS = 3
NUM_DESC_PARTS = 23
NUM_CLAIMS_PARTS = 4

In [14]:
def multithreaded_extended_batch_creation(start_index):

    if os.path.exists(FILE_PREFIX + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()
    
    len_abs_sentences = 0
    len_desc_sentences = 0
    len_desc_paragraphs = 0
    len_claims_sentences = 0
    
    len_abs_tokens = []
    len_desc_tokens = []
    len_claims_tokens = []

    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        patent_doc = get_patent(doc_id)
        
        # Abstract
        abstract = patent_doc['abstract'][0]
        root = ET.fromstring(abstract.encode('utf-8'))
        abs_paragraphs = get_adjusted_paragraphs(root)
        
        # Description
        desc = patent_doc['description'][0]
        root = ET.fromstring(desc.encode('utf-8'))
        desc_paragraphs = get_adjusted_paragraphs(root)
        
        # Claims
        claims = patent_doc['claims'][0]
        root = ET.fromstring(claims.encode('utf-8'))
        claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
#         claims_paragraphs = []
#         for claim in patent_doc['claims']:
#             claims_paragraphs.append(claim.strip())

#         abstract_sentences = sum([get_sentences(abs_parag) for abs_parag in abs_paragraphs], [])
#         desc_sentences = sum([get_sentences(desc_parag) for desc_parag in desc_paragraphs], [])
#         claims_sentences = sum([get_sentences(claim_parag) for claim_parag in claims_paragraphs], [])
        

        abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
        desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
        claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
        
        
        
        # lists of list of tokens
        doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
        abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
        description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
        claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens
        
        # now add the tokens lists that will be written to the file
        token_lines.append(doc_tokens_list)
        token_lines.append(abstract_tokens_list)
        token_lines.append(description_tokens_list)
        token_lines.append(claims_tokens_list)
        
        for i in range(NUM_ABSTRACT_PARTS):
            start, end = get_doc_range(i, len(abstract_tokens), NUM_ABSTRACT_PARTS)
            token_lines.append([ABSTRACT_PART_ID.format(doc_id, i+1)] + abstract_tokens[start: end])
        
        for i in range(NUM_DESC_PARTS):
            start, end = get_doc_range(i, len(desc_tokens), NUM_DESC_PARTS)
            token_lines.append([DESC_PART_ID.format(doc_id, i+1)] + desc_tokens[start: end])    
        
        for i in range(NUM_CLAIMS_PARTS):
            start, end = get_doc_range(i, len(claims_tokens), NUM_CLAIMS_PARTS)
            token_lines.append([CLAIMS_PART_ID.format(doc_id, i+1)] + claims_tokens[start: end])
            
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines
    
def get_doc_range(i, number_of_tokens, number_of_parts):
    start, end = 0,0
    if number_of_tokens < number_of_parts:
        if i==0:
            return 0, None
        else:
            return number_of_tokens,None
    if i == 0:
        start = 0
    else:
        start = (number_of_tokens / number_of_parts) * i
    if i+1 == number_of_parts:
        end = None
    else:
        end = (number_of_tokens / number_of_parts) * (i+1)
    return start, end

In [15]:
def write_batch(file_prefix, batch_lines, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        with open(file_prefix + str(batch_start), 'w') as batch_file:
            for line in batch_lines:
                batch_file.write((u" ".join(line) + "\n").encode('utf-8'))

## Training

In [20]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [21]:
batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE)

In [22]:
batches

[0,
 10000,
 20000,
 30000,
 40000,
 50000,
 60000,
 70000,
 80000,
 90000,
 100000,
 110000,
 120000,
 130000,
 140000,
 150000,
 160000,
 170000,
 180000,
 190000,
 200000,
 210000,
 220000,
 230000,
 240000,
 250000]

In [23]:
try:
    pool = ThreadPool(9)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-22 00:57:06,749 : INFO : Batch creation working on 20000

2017-03-22 00:57:06,752 : INFO : Batch creation working on 40000

2017-03-22 00:57:06,750 : INFO : Batch creation working on 30000

2017-03-22 00:57:06,753 : INFO : Batch creation working on 50000

2017-03-22 00:57:06,749 : INFO : Batch creation working on 10000

2017-03-22 00:57:06,754 : INFO : Batch creation working on 80000

2017-03-22 00:57:06,753 : INFO : Batch creation working on 60000

2017-03-22 00:57:06,749 : INFO : Batch creation working on 0

2017-03-22 00:57:06,754 : INFO : Batch creation working on 70000

2017-03-22 00:57:13,067 : INFO : Doc:      0 -> Total Lines to write:       34
2017-03-22 00:57:13,087 : INFO : Doc:  30000 -> Total Lines to write:       34
2017-03-22 00:57:13,101 : INFO : Doc:  40000 -> Total Lines to write:       34
2017-03-22 00:57:13,114 : INFO : Doc:  60000 -> Total Lines to write:       34
2017-03-22 00:57:13,217 : INFO : Doc:  10000 -> Total Lines to write:       34
2017-03-22 00:5

writing batch 0


2017-03-22 01:34:39,276 : INFO : Doc:  69000 -> Total Lines to write:   306034
2017-03-22 01:34:55,892 : INFO : Doc:  89000 -> Total Lines to write:   306034
2017-03-22 01:35:05,018 : INFO : Batch creation working on 90000

2017-03-22 01:35:05,238 : INFO : Doc:  90000 -> Total Lines to write:       34
2017-03-22 01:36:18,382 : INFO : Doc:  79000 -> Total Lines to write:   306034
2017-03-22 01:36:58,889 : INFO : Finished batch 50000 of size 10000 in 39m 52s
2017-03-22 01:36:58,892 : INFO : For index 50000, the actual number of lines written is: 340000


writing batch 50000


2017-03-22 01:37:23,118 : INFO : Finished batch 30000 of size 10000 in 40m 16s
2017-03-22 01:37:23,121 : INFO : For index 30000, the actual number of lines written is: 340000


writing batch 30000


2017-03-22 01:37:25,757 : INFO : Finished batch 20000 of size 10000 in 40m 19s
2017-03-22 01:37:25,760 : INFO : For index 20000, the actual number of lines written is: 340000


writing batch 20000


2017-03-22 01:37:31,328 : INFO : Batch creation working on 100000

2017-03-22 01:37:31,795 : INFO : Doc: 100000 -> Total Lines to write:       34
2017-03-22 01:37:32,520 : INFO : Finished batch 40000 of size 10000 in 40m 26s
2017-03-22 01:37:32,539 : INFO : For index 40000, the actual number of lines written is: 340000


writing batch 40000


2017-03-22 01:38:07,037 : INFO : Finished batch 10000 of size 10000 in 41m 0s
2017-03-22 01:38:07,040 : INFO : For index 10000, the actual number of lines written is: 340000


writing batch 10000


2017-03-22 01:38:20,528 : INFO : Batch creation working on 110000

2017-03-22 01:38:20,528 : INFO : Batch creation working on 120000

2017-03-22 01:38:20,770 : INFO : Doc: 110000 -> Total Lines to write:       34
2017-03-22 01:38:20,772 : INFO : Doc: 120000 -> Total Lines to write:       34
2017-03-22 01:38:20,531 : INFO : Batch creation working on 130000

2017-03-22 01:38:21,470 : INFO : Doc: 130000 -> Total Lines to write:       34
2017-03-22 01:38:40,224 : INFO : Finished batch 80000 of size 10000 in 41m 33s
2017-03-22 01:38:40,227 : INFO : For index 80000, the actual number of lines written is: 340000


writing batch 80000


2017-03-22 01:38:59,486 : INFO : Finished batch 60000 of size 10000 in 41m 53s
2017-03-22 01:38:59,488 : INFO : For index 60000, the actual number of lines written is: 340000


writing batch 60000


2017-03-22 01:39:00,053 : INFO : Doc:  91000 -> Total Lines to write:    34034
2017-03-22 01:40:22,803 : INFO : Finished batch 70000 of size 10000 in 43m 16s
2017-03-22 01:40:22,849 : INFO : For index 70000, the actual number of lines written is: 340000


writing batch 70000


2017-03-22 01:40:23,286 : INFO : Batch creation working on 140000

2017-03-22 01:40:23,588 : INFO : Doc: 140000 -> Total Lines to write:       34
2017-03-22 01:40:46,576 : INFO : Batch creation working on 150000

2017-03-22 01:40:46,713 : INFO : Doc: 150000 -> Total Lines to write:       34
2017-03-22 01:40:46,576 : INFO : Batch creation working on 160000

2017-03-22 01:40:47,052 : INFO : Doc: 160000 -> Total Lines to write:       34
2017-03-22 01:40:58,678 : INFO : Batch creation working on 170000

2017-03-22 01:40:59,009 : INFO : Doc: 170000 -> Total Lines to write:       34
2017-03-22 01:41:24,933 : INFO : Doc: 101000 -> Total Lines to write:    34034
2017-03-22 01:41:52,876 : INFO : Doc: 131000 -> Total Lines to write:    34034
2017-03-22 01:41:55,147 : INFO : Doc: 121000 -> Total Lines to write:    34034
2017-03-22 01:42:10,684 : INFO : Doc: 111000 -> Total Lines to write:    34034
2017-03-22 01:42:50,911 : INFO : Doc:  92000 -> Total Lines to write:    68034
2017-03-22 01:44:31,6

writing batch 90000


2017-03-22 02:16:12,538 : INFO : Batch creation working on 180000

2017-03-22 02:16:13,186 : INFO : Doc: 180000 -> Total Lines to write:       34
2017-03-22 02:16:51,911 : INFO : Doc: 119000 -> Total Lines to write:   306034
2017-03-22 02:17:17,579 : INFO : Doc: 109000 -> Total Lines to write:   306034
2017-03-22 02:17:21,439 : INFO : Doc: 159000 -> Total Lines to write:   306034
2017-03-22 02:17:48,873 : INFO : Doc: 169000 -> Total Lines to write:   306034
2017-03-22 02:19:21,192 : INFO : Finished batch 130000 of size 10000 in 41m 0s
2017-03-22 02:19:21,196 : INFO : For index 130000, the actual number of lines written is: 340000


writing batch 130000


2017-03-22 02:19:29,659 : INFO : Doc: 179000 -> Total Lines to write:   306034
2017-03-22 02:19:42,564 : INFO : Finished batch 120000 of size 10000 in 41m 22s
2017-03-22 02:19:42,566 : INFO : For index 120000, the actual number of lines written is: 340000


writing batch 120000


2017-03-22 02:20:00,967 : INFO : Batch creation working on 190000

2017-03-22 02:20:01,191 : INFO : Doc: 190000 -> Total Lines to write:       34
2017-03-22 02:20:21,276 : INFO : Batch creation working on 200000

2017-03-22 02:20:21,621 : INFO : Doc: 200000 -> Total Lines to write:       34
2017-03-22 02:20:38,956 : INFO : Doc: 181000 -> Total Lines to write:    34034
2017-03-22 02:21:19,550 : INFO : Finished batch 110000 of size 10000 in 42m 59s
2017-03-22 02:21:19,566 : INFO : For index 110000, the actual number of lines written is: 340000


writing batch 110000


2017-03-22 02:21:31,650 : INFO : Finished batch 150000 of size 10000 in 40m 45s
2017-03-22 02:21:31,653 : INFO : For index 150000, the actual number of lines written is: 340000


writing batch 150000


2017-03-22 02:21:35,781 : INFO : Finished batch 100000 of size 10000 in 44m 4s
2017-03-22 02:21:35,783 : INFO : For index 100000, the actual number of lines written is: 340000


writing batch 100000


2017-03-22 02:22:03,009 : INFO : Batch creation working on 210000

2017-03-22 02:22:03,406 : INFO : Doc: 210000 -> Total Lines to write:       34
2017-03-22 02:22:12,410 : INFO : Batch creation working on 220000

2017-03-22 02:22:12,784 : INFO : Doc: 220000 -> Total Lines to write:       34
2017-03-22 02:22:18,052 : INFO : Batch creation working on 230000

2017-03-22 02:22:18,697 : INFO : Doc: 230000 -> Total Lines to write:       34
2017-03-22 02:22:34,255 : INFO : Finished batch 160000 of size 10000 in 41m 48s
2017-03-22 02:22:34,263 : INFO : For index 160000, the actual number of lines written is: 340000


writing batch 160000


2017-03-22 02:23:10,387 : INFO : Batch creation working on 240000

2017-03-22 02:23:10,620 : INFO : Doc: 240000 -> Total Lines to write:       34
2017-03-22 02:23:41,342 : INFO : Finished batch 170000 of size 10000 in 42m 43s
2017-03-22 02:23:41,344 : INFO : For index 170000, the actual number of lines written is: 340000


writing batch 170000


2017-03-22 02:24:13,706 : INFO : Doc: 191000 -> Total Lines to write:    34034
2017-03-22 02:24:17,772 : INFO : Batch creation working on 250000

2017-03-22 02:24:17,901 : INFO : Doc: 250000 -> Total Lines to write:       34
2017-03-22 02:24:49,179 : INFO : Doc: 182000 -> Total Lines to write:    68034
2017-03-22 02:25:01,448 : INFO : Doc: 201000 -> Total Lines to write:    34034
2017-03-22 02:26:18,664 : INFO : Doc: 231000 -> Total Lines to write:    34034
2017-03-22 02:26:18,683 : INFO : Doc: 211000 -> Total Lines to write:    34034
2017-03-22 02:26:24,282 : INFO : Doc: 221000 -> Total Lines to write:    34034
2017-03-22 02:27:22,697 : INFO : Doc: 241000 -> Total Lines to write:    34034
2017-03-22 02:28:07,126 : INFO : Doc: 251000 -> Total Lines to write:    34034
2017-03-22 02:28:52,947 : INFO : Doc: 192000 -> Total Lines to write:    68034
2017-03-22 02:29:12,838 : INFO : Doc: 202000 -> Total Lines to write:    68034
2017-03-22 02:29:19,431 : INFO : Doc: 183000 -> Total Lines to w

writing batch 250000


2017-03-22 02:41:35,353 : INFO : Doc: 205000 -> Total Lines to write:   170034
2017-03-22 02:41:41,281 : INFO : Doc: 195000 -> Total Lines to write:   170034
2017-03-22 02:41:49,932 : INFO : Doc: 186000 -> Total Lines to write:   204034
2017-03-22 02:42:42,483 : INFO : Doc: 215000 -> Total Lines to write:   170034
2017-03-22 02:42:44,377 : INFO : Doc: 145000 -> Total Lines to write:   170034
2017-03-22 02:42:51,146 : INFO : Doc: 225000 -> Total Lines to write:   170034
2017-03-22 02:42:57,285 : INFO : Doc: 235000 -> Total Lines to write:   170034
2017-03-22 02:43:56,025 : INFO : Doc: 245000 -> Total Lines to write:   170034
2017-03-22 02:45:18,821 : INFO : Doc: 196000 -> Total Lines to write:   204034
2017-03-22 02:45:21,321 : INFO : Doc: 206000 -> Total Lines to write:   204034
2017-03-22 02:45:24,851 : INFO : Doc: 187000 -> Total Lines to write:   238034
2017-03-22 02:46:18,287 : INFO : Doc: 216000 -> Total Lines to write:   204034
2017-03-22 02:46:32,974 : INFO : Doc: 226000 -> Tota

writing batch 180000


2017-03-22 02:57:48,805 : INFO : Doc: 229000 -> Total Lines to write:   306034
2017-03-22 02:58:06,970 : INFO : Doc: 219000 -> Total Lines to write:   306034
2017-03-22 02:58:14,372 : INFO : Doc: 149000 -> Total Lines to write:   306034
2017-03-22 02:58:41,933 : INFO : Doc: 239000 -> Total Lines to write:   306034
2017-03-22 03:00:35,314 : INFO : Doc: 249000 -> Total Lines to write:   306034
2017-03-22 03:00:40,809 : INFO : Finished batch 190000 of size 10000 in 40m 40s
2017-03-22 03:00:40,811 : INFO : For index 190000, the actual number of lines written is: 340000


writing batch 190000


2017-03-22 03:01:04,847 : INFO : Finished batch 200000 of size 10000 in 40m 43s
2017-03-22 03:01:04,850 : INFO : For index 200000, the actual number of lines written is: 340000


writing batch 200000


2017-03-22 03:01:45,252 : INFO : Finished batch 210000 of size 10000 in 39m 42s
2017-03-22 03:01:45,255 : INFO : For index 210000, the actual number of lines written is: 340000


writing batch 210000


2017-03-22 03:01:59,204 : INFO : Finished batch 140000 of size 10000 in 81m 36s
2017-03-22 03:01:59,207 : INFO : For index 140000, the actual number of lines written is: 340000


writing batch 140000


2017-03-22 03:02:17,960 : INFO : Finished batch 220000 of size 10000 in 40m 6s
2017-03-22 03:02:17,964 : INFO : For index 220000, the actual number of lines written is: 340000


writing batch 220000


2017-03-22 03:03:03,237 : INFO : Finished batch 230000 of size 10000 in 40m 45s
2017-03-22 03:03:03,240 : INFO : For index 230000, the actual number of lines written is: 340000


writing batch 230000


2017-03-22 03:04:34,720 : INFO : Finished batch 240000 of size 10000 in 41m 24s
2017-03-22 03:04:34,723 : INFO : For index 240000, the actual number of lines written is: 340000


writing batch 240000


In [25]:
multithreaded_extended_batch_creation(0)

2017-03-16 10:44:02,083 : INFO : Batch creation working on 0

2017-03-16 10:44:02,175 : INFO : Doc:      0 -> Total Lines to write:       78
2017-03-16 10:44:18,155 : INFO : Finished batch 0 of size 100 in 0m 16s
2017-03-16 10:44:18,160 : INFO : For index 0, the actual number of lines written is: 7800


writing batch 0


## Validation

In [24]:
DOCS_LIST = validation_docs_list
FILE_PREFIX = VALIDATION_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(validation_docs_list)

In [25]:
try:
    pool = ThreadPool(8)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-22 03:05:13,003 : INFO : Batch creation working on 0

2017-03-22 03:05:13,003 : INFO : Batch creation working on 10000

2017-03-22 03:05:13,005 : INFO : Batch creation working on 30000

2017-03-22 03:05:13,004 : INFO : Batch creation working on 40000

2017-03-22 03:05:13,006 : INFO : Batch creation working on 50000

2017-03-22 03:05:13,005 : INFO : Batch creation working on 20000

2017-03-22 03:05:13,007 : INFO : Batch creation working on 60000

2017-03-22 03:05:13,133 : INFO : Doc:  10000 -> Total Lines to write:       34
2017-03-22 03:05:13,204 : INFO : Doc:      0 -> Total Lines to write:       34
2017-03-22 03:05:13,216 : INFO : Doc:  50000 -> Total Lines to write:       34
2017-03-22 03:05:13,228 : INFO : Doc:  30000 -> Total Lines to write:       34
2017-03-22 03:05:13,274 : INFO : Doc:  60000 -> Total Lines to write:       34
2017-03-22 03:05:13,284 : INFO : Doc:  40000 -> Total Lines to write:       34
2017-03-22 03:05:13,357 : INFO : Doc:  20000 -> Total Lines to write

writing batch 60000


2017-03-22 03:07:58,696 : INFO : Doc:   1000 -> Total Lines to write:    34034
2017-03-22 03:08:08,130 : INFO : Doc:  11000 -> Total Lines to write:    34034
2017-03-22 03:08:23,847 : INFO : Doc:  41000 -> Total Lines to write:    34034
2017-03-22 03:08:31,979 : INFO : Doc:  31000 -> Total Lines to write:    34034
2017-03-22 03:08:34,890 : INFO : Doc:  21000 -> Total Lines to write:    34034
2017-03-22 03:08:46,368 : INFO : Doc:  51000 -> Total Lines to write:    34034
2017-03-22 03:10:57,926 : INFO : Doc:   2000 -> Total Lines to write:    68034
2017-03-22 03:11:03,000 : INFO : Doc:  12000 -> Total Lines to write:    68034
2017-03-22 03:11:39,653 : INFO : Doc:  32000 -> Total Lines to write:    68034
2017-03-22 03:11:41,248 : INFO : Doc:  42000 -> Total Lines to write:    68034
2017-03-22 03:11:50,615 : INFO : Doc:  22000 -> Total Lines to write:    68034
2017-03-22 03:11:59,586 : INFO : Doc:  52000 -> Total Lines to write:    68034
2017-03-22 03:14:03,612 : INFO : Doc:   3000 -> Tota

writing batch 0


2017-03-22 03:36:08,866 : INFO : Doc:  39000 -> Total Lines to write:   306034
2017-03-22 03:36:09,796 : INFO : Doc:  49000 -> Total Lines to write:   306034
2017-03-22 03:36:52,540 : INFO : Finished batch 10000 of size 10000 in 31m 40s
2017-03-22 03:36:52,543 : INFO : For index 10000, the actual number of lines written is: 340000


writing batch 10000


2017-03-22 03:37:05,098 : INFO : Doc:  59000 -> Total Lines to write:   306034
2017-03-22 03:38:23,020 : INFO : Finished batch 20000 of size 10000 in 33m 10s
2017-03-22 03:38:23,023 : INFO : For index 20000, the actual number of lines written is: 340000


writing batch 20000


2017-03-22 03:39:55,337 : INFO : Finished batch 40000 of size 10000 in 34m 42s
2017-03-22 03:39:55,339 : INFO : For index 40000, the actual number of lines written is: 340000


writing batch 40000


2017-03-22 03:40:00,576 : INFO : Finished batch 30000 of size 10000 in 34m 48s
2017-03-22 03:40:00,579 : INFO : For index 30000, the actual number of lines written is: 340000


writing batch 30000


2017-03-22 03:40:49,468 : INFO : Finished batch 50000 of size 10000 in 35m 36s
2017-03-22 03:40:49,471 : INFO : For index 50000, the actual number of lines written is: 340000


writing batch 50000


## Test

In [16]:
DOCS_LIST = test_docs_list
FILE_PREFIX = TEST_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(test_docs_list)

In [17]:
try:
    pool = ThreadPool(8)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-30 01:52:57,499 : INFO : Batch creation working on 10000

2017-03-30 01:52:57,500 : INFO : Batch creation working on 50000

2017-03-30 01:52:57,501 : INFO : Batch creation working on 40000

2017-03-30 01:52:57,500 : INFO : Batch creation working on 60000

2017-03-30 01:52:57,501 : INFO : Batch creation working on 70000

2017-03-30 01:52:57,500 : INFO : Batch creation working on 30000

2017-03-30 01:52:57,499 : INFO : Batch creation working on 0

2017-03-30 01:52:57,499 : INFO : Batch creation working on 20000

2017-03-30 01:53:05,477 : INFO : Doc:  60000 -> Total Lines to write:       34
2017-03-30 01:53:05,484 : INFO : Doc:  40000 -> Total Lines to write:       34
2017-03-30 01:53:05,514 : INFO : Doc:  10000 -> Total Lines to write:       34
2017-03-30 01:53:05,528 : INFO : Doc:      0 -> Total Lines to write:       34
2017-03-30 01:53:05,534 : INFO : Doc:  20000 -> Total Lines to write:       34
2017-03-30 01:53:05,561 : INFO : Doc:  50000 -> Total Lines to write:       34
20

writing batch 10000


2017-03-30 02:32:56,419 : INFO : Finished batch 0 of size 10000 in 39m 59s
2017-03-30 02:32:56,421 : INFO : For index 0, the actual number of lines written is: 340000


writing batch 0


2017-03-30 02:33:15,904 : INFO : Doc:  79000 -> Total Lines to write:   306034
2017-03-30 02:33:27,040 : INFO : Doc:  69000 -> Total Lines to write:   306034
2017-03-30 02:34:25,339 : INFO : Finished batch 50000 of size 10000 in 41m 28s
2017-03-30 02:34:25,345 : INFO : For index 50000, the actual number of lines written is: 340000


writing batch 50000


2017-03-30 02:34:37,595 : INFO : Finished batch 20000 of size 10000 in 41m 40s
2017-03-30 02:34:37,597 : INFO : For index 20000, the actual number of lines written is: 340000


writing batch 20000


2017-03-30 02:34:38,041 : INFO : Finished batch 40000 of size 10000 in 41m 41s
2017-03-30 02:34:38,044 : INFO : For index 40000, the actual number of lines written is: 340000


writing batch 40000


2017-03-30 02:35:33,765 : INFO : Finished batch 30000 of size 10000 in 42m 36s
2017-03-30 02:35:33,768 : INFO : For index 30000, the actual number of lines written is: 340000


writing batch 30000


2017-03-30 02:35:55,922 : INFO : Finished batch 70000 of size 10000 in 42m 58s
2017-03-30 02:35:55,925 : INFO : For index 70000, the actual number of lines written is: 332690


writing batch 70000


2017-03-30 02:37:12,736 : INFO : Finished batch 60000 of size 10000 in 44m 15s
2017-03-30 02:37:12,739 : INFO : For index 60000, the actual number of lines written is: 340000


writing batch 60000


## Stats

In [37]:
BATCH_SIZE = 1000

In [38]:
def multithreaded_extended_batch_creation(start_index):

#     if os.path.exists(FILE_PREFIX + str(start_index)):
#         info("Batch {} already exists, skipping..".format(start_index))
#         return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()
    
    len_abs_sentences = 0
    len_desc_sentences = 0
    len_desc_paragraphs = 0
    len_claims_sentences = 0
    
    len_abs_tokens = []
    len_desc_tokens = []
    len_claims_tokens = []
    
    
    len_desc_parag_tokens = []

    len_claims_paragraphs = []
    
    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        patent_doc = get_patent(doc_id)
        
        # Abstract
        abstract = patent_doc['abstract'][0]
        root = ET.fromstring(abstract.encode('utf-8'))
        abs_paragraphs = get_adjusted_paragraphs(root)
        
        # Description
        desc = patent_doc['description'][0]
        root = ET.fromstring(desc.encode('utf-8'))
        desc_paragraphs = get_adjusted_paragraphs(root)
        
        # Claims
        claims = patent_doc['claims'][0]
        root = ET.fromstring(claims.encode('utf-8'))
        claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
#         claims_paragraphs = []
#         for claim in patent_doc['claims']:
#             claims_paragraphs.append(claim.strip())

        len_claims_paragraphs.append(len(claims_paragraphs))
    
        abstract_sentences = sum([get_sentences(abs_parag) for abs_parag in abs_paragraphs], [])
        desc_sentences = sum([get_sentences(desc_parag) for desc_parag in desc_paragraphs], [])
        claims_sentences = sum([get_sentences(claim_parag) for claim_parag in claims_paragraphs], [])
        
        len_abs_sentences += len(abstract_sentences)
        len_desc_sentences += len(desc_sentences)
        len_claims_sentences += len(claims_sentences)
        
        len_desc_paragraphs += len(desc_paragraphs)
        

        abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
        
        len_desc_parag_tokens.extend([len(sentence_wordtokenizer(parag)) for parag in desc_paragraphs])
        desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
        
        claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
        
        
        len_abs_tokens.append(len(abstract_tokens))
        len_desc_tokens.append(len(desc_tokens))
        len_claims_tokens.append(len(claims_tokens))
        
        # lists of list of tokens
        doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
        abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
        description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
        claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens
        
        # now add the tokens lists that will be written to the file
        token_lines.append(doc_tokens_list)
        token_lines.append(abstract_tokens_list)
        token_lines.append(description_tokens_list)
        token_lines.append(claims_tokens_list)
        
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    print "Average Abstract Sentences: {}".format(len_abs_sentences/doc_index)
    print "Average Desc Sentences: {}".format(len_desc_sentences/doc_index)
    print "Average Desc Paragraphs: {}".format(len_desc_paragraphs/doc_index)
    print "Average Claims Sentences: {}".format(len_claims_sentences/doc_index)
    
    
    print "Abstract Tokens: Mean: {} - Median: {}".format(np.mean(len_abs_tokens), np.median(len_abs_tokens))
    print "Description Tokens: Mean: {} - Median: {}".format(np.mean(len_desc_tokens), np.median(len_desc_tokens))
    print "Claims Tokens: Mean: {} - Median: {}".format(np.mean(len_claims_tokens), np.median(len_claims_tokens))
    print "Description Paragraphs Tokens: Mean: {} - Median: {}".format(np.mean(len_desc_parag_tokens), 
                                                                        np.median(len_desc_parag_tokens))
    
    print "Claims Paragraphs: Mean: {} - Median: {}".format(np.mean(len_claims_paragraphs), 
                                                                        np.median(len_claims_paragraphs))
#     write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines

In [39]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [40]:
try:
    pool = ThreadPool(8) # use just 6 because every batch requires a lot of memory
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE*10 )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-16 01:00:43,555 : INFO : Batch creation working on 0

2017-03-16 01:00:43,555 : INFO : Batch creation working on 10000

2017-03-16 01:00:43,558 : INFO : Batch creation working on 40000

2017-03-16 01:00:43,555 : INFO : Batch creation working on 20000

2017-03-16 01:00:43,558 : INFO : Batch creation working on 50000

2017-03-16 01:00:43,559 : INFO : Batch creation working on 30000

2017-03-16 01:00:43,591 : INFO : Batch creation working on 60000

2017-03-16 01:00:43,601 : INFO : Batch creation working on 70000

2017-03-16 01:00:43,690 : INFO : Doc:  50000 -> Total Lines to write:        4
2017-03-16 01:00:43,742 : INFO : Doc:  60000 -> Total Lines to write:        4
2017-03-16 01:00:43,767 : INFO : Doc:      0 -> Total Lines to write:        4
2017-03-16 01:00:43,806 : INFO : Doc:  40000 -> Total Lines to write:        4
2017-03-16 01:00:43,846 : INFO : Doc:  20000 -> Total Lines to write:        4
2017-03-16 01:00:43,904 : INFO : Doc:  10000 -> Total Lines to write:        4
20

Average Abstract Sentences: 3
Average Desc Sentences: 236
Average Desc Paragraphs: 56
Average Claims Sentences: 19
Abstract Tokens: Mean: 118.576 - Median: 114.0
Description Tokens: Mean: 7694.354 - Median: 5256.5
Claims Tokens: Mean: 1101.352 - Median: 828.0
Description Paragraphs Tokens: Mean: 137.372194747 - Median: 113.0
Claims Paragraphs: Mean: 19.411 - Median: 16.0


2017-03-16 01:05:31,749 : INFO : Batch creation working on 80000

2017-03-16 01:05:33,147 : INFO : Doc:  80000 -> Total Lines to write:        4
2017-03-16 01:05:49,041 : INFO : Finished batch 40000 of size 1000 in 5m 5s
2017-03-16 01:05:49,060 : INFO : For index 40000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 261
Average Desc Paragraphs: 61
Average Claims Sentences: 17
Abstract Tokens: Mean: 119.666 - Median: 119.0
Description Tokens: Mean: 8411.443 - Median: 6107.0
Claims Tokens: Mean: 1111.596 - Median: 944.0
Description Paragraphs Tokens: Mean: 137.076788944 - Median: 113.0
Claims Paragraphs: Mean: 17.226 - Median: 16.0


2017-03-16 01:05:49,886 : INFO : Batch creation working on 90000

2017-03-16 01:05:50,033 : INFO : Doc:  90000 -> Total Lines to write:        4
2017-03-16 01:05:56,244 : INFO : Finished batch 50000 of size 1000 in 5m 13s
2017-03-16 01:05:56,248 : INFO : For index 50000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 265
Average Desc Paragraphs: 62
Average Claims Sentences: 17
Abstract Tokens: Mean: 116.916 - Median: 118.0
Description Tokens: Mean: 8561.637 - Median: 5910.0
Claims Tokens: Mean: 1082.717 - Median: 892.0
Description Paragraphs Tokens: Mean: 137.32235713 - Median: 112.0
Claims Paragraphs: Mean: 17.019 - Median: 15.0


2017-03-16 01:05:57,120 : INFO : Batch creation working on 100000

2017-03-16 01:05:57,343 : INFO : Doc: 100000 -> Total Lines to write:        4
2017-03-16 01:05:58,003 : INFO : Finished batch 20000 of size 1000 in 5m 14s
2017-03-16 01:05:58,006 : INFO : For index 20000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 266
Average Desc Paragraphs: 61
Average Claims Sentences: 17
Abstract Tokens: Mean: 118.616 - Median: 121.0
Description Tokens: Mean: 8539.343 - Median: 5729.0
Claims Tokens: Mean: 1096.825 - Median: 894.0
Description Paragraphs Tokens: Mean: 138.749581607 - Median: 113.0
Claims Paragraphs: Mean: 17.846 - Median: 15.0


2017-03-16 01:05:59,193 : INFO : Batch creation working on 110000

2017-03-16 01:06:00,006 : INFO : Doc: 110000 -> Total Lines to write:        4
2017-03-16 01:06:07,053 : INFO : Finished batch 30000 of size 1000 in 5m 23s
2017-03-16 01:06:07,059 : INFO : For index 30000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 271
Average Desc Paragraphs: 64
Average Claims Sentences: 17
Abstract Tokens: Mean: 118.392 - Median: 118.0
Description Tokens: Mean: 8803.666 - Median: 6171.5
Claims Tokens: Mean: 1099.404 - Median: 923.0
Description Paragraphs Tokens: Mean: 137.510012183 - Median: 113.0
Claims Paragraphs: Mean: 17.875 - Median: 16.0


2017-03-16 01:06:08,003 : INFO : Batch creation working on 120000

2017-03-16 01:06:08,152 : INFO : Doc: 120000 -> Total Lines to write:        4
2017-03-16 01:06:14,886 : INFO : Finished batch 10000 of size 1000 in 5m 31s
2017-03-16 01:06:14,891 : INFO : For index 10000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 272
Average Desc Paragraphs: 63
Average Claims Sentences: 19
Abstract Tokens: Mean: 117.434 - Median: 116.0
Description Tokens: Mean: 8971.263 - Median: 5658.5
Claims Tokens: Mean: 1131.289 - Median: 896.0
Description Paragraphs Tokens: Mean: 141.313113334 - Median: 114.0
Claims Paragraphs: Mean: 19.008 - Median: 16.0


2017-03-16 01:06:18,493 : INFO : Finished batch 70000 of size 1000 in 5m 35s
2017-03-16 01:06:18,497 : INFO : For index 70000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 282
Average Desc Paragraphs: 66
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.501 - Median: 117.0
Description Tokens: Mean: 9176.843 - Median: 6447.5
Claims Tokens: Mean: 1058.939 - Median: 924.5
Description Paragraphs Tokens: Mean: 137.616864615 - Median: 112.0
Claims Paragraphs: Mean: 16.587 - Median: 16.0


2017-03-16 01:06:41,087 : INFO : Finished batch 60000 of size 1000 in 5m 57s
2017-03-16 01:06:41,089 : INFO : For index 60000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 295
Average Desc Paragraphs: 71
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.293 - Median: 118.0
Description Tokens: Mean: 9770.952 - Median: 6617.0
Claims Tokens: Mean: 1059.014 - Median: 906.5
Description Paragraphs Tokens: Mean: 137.735438399 - Median: 112.0
Claims Paragraphs: Mean: 16.633 - Median: 16.0


2017-03-16 01:07:02,979 : INFO : Finished batch 120000 of size 1000 in 0m 55s
2017-03-16 01:07:02,982 : INFO : For index 120000, the actual number of lines written is: 624


Average Abstract Sentences: 3
Average Desc Sentences: 295
Average Desc Paragraphs: 67
Average Claims Sentences: 34
Abstract Tokens: Mean: 123.487179487 - Median: 123.5
Description Tokens: Mean: 9056.32051282 - Median: 6444.0
Claims Tokens: Mean: 2129.06410256 - Median: 1735.0
Description Paragraphs Tokens: Mean: 135.207771079 - Median: 111.0
Claims Paragraphs: Mean: 34.3141025641 - Median: 30.0


2017-03-16 01:10:54,091 : INFO : Finished batch 80000 of size 1000 in 5m 22s
2017-03-16 01:10:54,093 : INFO : For index 80000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 277
Average Desc Paragraphs: 65
Average Claims Sentences: 16
Abstract Tokens: Mean: 114.978 - Median: 118.0
Description Tokens: Mean: 9035.3 - Median: 6557.0
Claims Tokens: Mean: 1052.833 - Median: 908.0
Description Paragraphs Tokens: Mean: 137.333373866 - Median: 113.0
Claims Paragraphs: Mean: 16.561 - Median: 16.0


2017-03-16 01:11:22,433 : INFO : Finished batch 90000 of size 1000 in 5m 33s
2017-03-16 01:11:22,436 : INFO : For index 90000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 276
Average Desc Paragraphs: 65
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.039 - Median: 118.0
Description Tokens: Mean: 9215.904 - Median: 6593.5
Claims Tokens: Mean: 1066.299 - Median: 932.5
Description Paragraphs Tokens: Mean: 141.239908046 - Median: 116.0
Claims Paragraphs: Mean: 16.451 - Median: 16.0


2017-03-16 01:11:36,285 : INFO : Finished batch 100000 of size 1000 in 5m 39s
2017-03-16 01:11:36,288 : INFO : For index 100000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 287
Average Desc Paragraphs: 67
Average Claims Sentences: 16
Abstract Tokens: Mean: 113.675 - Median: 117.0
Description Tokens: Mean: 9544.564 - Median: 6973.5
Claims Tokens: Mean: 1108.746 - Median: 977.5
Description Paragraphs Tokens: Mean: 141.208486211 - Median: 116.0
Claims Paragraphs: Mean: 16.438 - Median: 17.0


2017-03-16 01:11:54,205 : INFO : Finished batch 110000 of size 1000 in 5m 55s
2017-03-16 01:11:54,208 : INFO : For index 110000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 311
Average Desc Paragraphs: 73
Average Claims Sentences: 16
Abstract Tokens: Mean: 114.923 - Median: 118.0
Description Tokens: Mean: 10124.423 - Median: 7225.5
Claims Tokens: Mean: 1053.8 - Median: 936.5
Description Paragraphs Tokens: Mean: 138.761056974 - Median: 115.0
Claims Paragraphs: Mean: 16.433 - Median: 16.0
