In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
import os
import re
import urllib2

import numpy as np
import random
import time

import json

import logging
from logging import info

from multiprocessing import Pool as ThreadPool
import itertools

import xml.etree.ElementTree as ET

import nltk

from thesis.utils.text import get_sentences, sentence_wordtokenizer

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
SAMPLE_RATIO = 0.15

In [4]:
# root_location = "/mnt/data2/shalaby/"
root_location = "/home/local/shalaby/"
exports_location = root_location + "exported_data/"

# training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)
training_file = root_location + 'docs_output.json'

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
# training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
# validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
training_docs_list_file = exports_location + "extended_pv_training_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
validation_docs_list_file = exports_location + "extended_pv_validation_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
test_docs_list_file = exports_location + "extended_pv_test_docs_list_" + str(SAMPLE_RATIO) + ".pkl"

In [5]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 25.1 s, sys: 1.4 s, total: 26.5 s
Wall time: 26.5 s


In [6]:
len(training_docs_list)

254767

#### Extraction Utils 

In [7]:
#ES_URL = 'http://localhost:9200/patents/patent/{}'
ES_URL = 'http://yell.dbs.ifi.lmu.de:9200/patents/patent/{}'
HEADING_TAG = 'heading'
PARAGRAPH_TAG = 'p'
UL_TAG = 'ul'
LI_TAG = 'li'
OL_TAG = 'ol'
DESC_OF_DRAWINGS_TAG = 'description-of-drawings'
MIN_PARAGRAPH_LENGTH = 50

In [8]:
def merge_with_previous(curr_node_tag, previous_node_tag, previous_node_text):
    if curr_node_tag == PARAGRAPH_TAG and previous_node_tag == HEADING_TAG:
        return True
    if previous_node_text and len(previous_node_text) < MIN_PARAGRAPH_LENGTH:
        return True
    return False
    
def get_paragraphs(root):
    paragraphs = []
    previous_node_text = None
    previous_tag = None
    for child in root:
        node_text = None
        if child.tag != DESC_OF_DRAWINGS_TAG:
            node_text = get_node_text(child)
            if node_text.strip():
                if merge_with_previous(child.tag, previous_tag, previous_node_text) and len(paragraphs) > 0:
                    paragraphs[-1] += ' ' + node_text
                else:
                    paragraphs.append(node_text)
        else:
            node_text = extract_desc_of_drawings_paragraph(child)
            paragraphs.append(node_text)
            
        previous_tag = child.tag
        previous_node_text = node_text
    return paragraphs
    
def extract_desc_of_drawings_paragraph(node):
    previous_tag = None
    sentences = []
    for child in node:
        node_text = get_node_text(child)
        if child.tag == PARAGRAPH_TAG and previous_tag == HEADING_TAG:
            sentences[-1] += ' ' + node_text
        else:
            # a paragraph in drawings descriptions is treated as a sentence
            if child.tag == PARAGRAPH_TAG:
                node_text = apply_sentence_end(node_text)
            sentences.append(node_text)
        previous_tag = child.tag
    
    return ' '.join(sentences)

def apply_sentence_end(text):
    if text and text.strip():
        text = text.strip().strip(';.')
        text += '. '
    return text

def itertext_custom(self):
    tag = self.tag
    if not isinstance(tag, basestring) and tag is not None:
        return
    if self.text:
        if tag == LI_TAG:
            yield apply_sentence_end(self.text)
        else:
            yield self.text.replace('\n',' ')
    for e in self:
        for s in e.itertext_custom():
            yield s
        if e.tail:
            yield e.tail

ET.Element.itertext_custom = itertext_custom
# def get_node_text(node):
#     node_text = ''
#     for child in node:
#         # for ul tags, get li tags as sentences
#         if child.tag == UL_TAG:
#             li_sentences = [apply_sentence_end(get_node_text_iterative(c)) for c in child]
#             child_text = ' '.join(li_sentences)
#         else:
#             child_text = get_node_text_iterative(child)
#         node_text += child_text
#     return node_text
        
get_node_text = lambda node: ''.join(node.itertext_custom()).strip()

In [9]:
def conc_paragraphs(parag1, parag2):
    return parag1.strip('.') + '.' + ' ' + parag2

def concatenate_sentences_to_paragraphs(paragraphs):
    """
    for 1 sentence paragraphs, concatenate them to the next or previous paragraph depending on context
    """
    for i in range(len(paragraphs)):
        if i >= len((paragraphs)): break
        parag = paragraphs[i]
        sentences = get_sentences(parag)
        
        if len(sentences) == 1:
            prev_paragraph = paragraphs[i-1] if i-1 >= 0 else None
            next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None

            if (next_paragraph and len(get_sentences(next_paragraph)) == 1):
                # If a series of 1 sentence length paragraphs exist, conc all of them in one paragraph
                while True:
                    if next_paragraph and len(get_sentences(next_paragraph)) == 1:
                        parag = conc_paragraphs(parag, next_paragraph)
                        paragraphs[i] = parag
                        del paragraphs[i+1]

                        # reinitialize for loop
                        next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None
                    else:
                        break

            # otherwise, just concatenate the 1 sentence paragraph to the previous paragraph
            elif prev_paragraph:
#                 print '============== Found prev eligible paragraph'
                prev_paragraph = conc_paragraphs(prev_paragraph, parag)
                paragraphs[i-1] = prev_paragraph
                del paragraphs[i]

            # if this is the first paragraph, then just concatenate it with the next one
            elif next_paragraph:
                parag = conc_paragraphs(parag, next_paragraph)
                paragraphs[i] = parag
                del paragraphs[i+1]

def get_adjusted_paragraphs(root, conc_sentences=True):
    paragraphs = get_paragraphs(root)
    if conc_sentences:
        concatenate_sentences_to_paragraphs(paragraphs)
    return paragraphs

In [10]:
def get_patent(doc_id):
    url_to_fetch = ES_URL.format(doc_id)

    response = urllib2.urlopen(url_to_fetch)
    patent_content = response.read()

    patent_object = json.loads(patent_content)['_source']
    return patent_object

# Actual Extraction

In [14]:
ABSTRACT_ID = "{}_abstract"
DESC_ID = "{}_description"
CLAIMS_ID = "{}_claims"

ABSTRACT_PARAGRAPH_ID = "{}_abstract_p{}"
DESCRIPTION_PARAGRAPH_ID = "{}_description_p{}"
CLAIMS_PARAGRAPH_ID = "{}_claims_p{}"

In [39]:
BATCH_SIZE = 10000

preprocessed_location = "/home/local/shalaby/" + "preprocessed_data/extended_pv_abs_desc_claims_large_sample_parags/"
TRAINING_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_training_docs_data_preprocessed-"
VALIDATION_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_validation_docs_data_preprocessed-"
TEST_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_test_docs_data_preprocessed-"

if not os.path.exists(preprocessed_location):
    os.makedirs(preprocessed_location)

In [40]:
def multithreaded_extended_batch_creation(start_index):

    if os.path.exists(FILE_PREFIX + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()
    
    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        patent_doc = get_patent(doc_id)
        
        # Abstract
        abstract = patent_doc['abstract'][0]
        root = ET.fromstring(abstract.encode('utf-8'))
        abs_paragraphs = get_adjusted_paragraphs(root)
        
        # Description
        desc = patent_doc['description'][0]
        root = ET.fromstring(desc.encode('utf-8'))
        desc_paragraphs = get_adjusted_paragraphs(root)
        
        # Claims
        claims = patent_doc['claims'][0]
        root = ET.fromstring(claims.encode('utf-8'))
        claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
        

        # Level 1
        # ==========
        list_abstract_paragraphs_tokens = [sentence_wordtokenizer(parag) for parag in abs_paragraphs]
        list_description_paragraphs_tokens = [sentence_wordtokenizer(parag) for parag in desc_paragraphs]
        list_claims_paragraphs_tokens = [sentence_wordtokenizer(parag) for parag in claims_paragraphs]
        
        abstract_tokens = sum(list_abstract_paragraphs_tokens, [])
        desc_tokens = sum(list_description_paragraphs_tokens, [])
        claims_tokens = sum(list_claims_paragraphs_tokens, [])
        
        doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
        abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
        description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
        claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens
        
        token_lines.append(doc_tokens_list)
        token_lines.append(abstract_tokens_list)
        token_lines.append(description_tokens_list)
        token_lines.append(claims_tokens_list)
        
        # Level 2
        # ==========
        # now add the tokens lists that will be written to the file
        
#         for i in range(len(list_abstract_paragraphs_tokens)):
#             token_lines.append([ABSTRACT_PARAGRAPH_ID.format(doc_id, i+1)] + list_abstract_paragraphs_tokens[i])
        
        for i in range(len(list_description_paragraphs_tokens)):
            token_lines.append([DESCRIPTION_PARAGRAPH_ID.format(doc_id, i+1)] + list_description_paragraphs_tokens[i])
            
#         for i in range(len(list_claims_paragraphs_tokens)):
#             token_lines.append([CLAIMS_PARAGRAPH_ID.format(doc_id, i+1)] + list_claims_paragraphs_tokens[i])
            
        
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines

In [41]:
def write_batch(file_prefix, batch_lines, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        with open(file_prefix + str(batch_start), 'w') as batch_file:
            for line in batch_lines:
                batch_file.write((u" ".join(line) + "\n").encode('utf-8'))

In [33]:
multithreaded_extended_batch_creation(0)

2017-03-22 12:56:50,403 : INFO : Batch creation working on 0

2017-03-22 12:56:50,474 : INFO : Doc:      0 -> Total Lines to write:       21
2017-03-22 12:56:51,317 : INFO : Finished batch 0 of size 10 in 0m 1s
2017-03-22 12:56:51,321 : INFO : For index 0, the actual number of lines written is: 344


writing batch 0


## Training

In [42]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [43]:
batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE)

In [44]:
batches

[0,
 10000,
 20000,
 30000,
 40000,
 50000,
 60000,
 70000,
 80000,
 90000,
 100000,
 110000,
 120000,
 130000,
 140000,
 150000,
 160000,
 170000,
 180000,
 190000,
 200000,
 210000,
 220000,
 230000,
 240000,
 250000]

In [46]:
try:
    pool = ThreadPool(9)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-22 12:58:20,991 : INFO : Batch creation working on 10000

2017-03-22 12:58:20,991 : INFO : Batch creation working on 20000

2017-03-22 12:58:20,992 : INFO : Batch creation working on 40000

2017-03-22 12:58:20,997 : INFO : Batch creation working on 60000

2017-03-22 12:58:20,991 : INFO : Batch creation working on 30000

2017-03-22 12:58:20,990 : INFO : Batch creation working on 0

2017-03-22 12:58:20,996 : INFO : Batch creation working on 50000

2017-03-22 12:58:21,007 : INFO : Batch creation working on 70000

2017-03-22 12:58:21,019 : INFO : Batch creation working on 80000

2017-03-22 12:59:12,801 : INFO : Doc:  20000 -> Total Lines to write:      117
2017-03-22 12:59:14,220 : INFO : Doc:  10000 -> Total Lines to write:       33
2017-03-22 12:59:16,041 : INFO : Doc:  80000 -> Total Lines to write:      180
2017-03-22 12:59:16,194 : INFO : Doc:  60000 -> Total Lines to write:       36
2017-03-22 12:59:17,896 : INFO : Doc:  30000 -> Total Lines to write:       36
2017-03-22 12:5

writing batch 0


2017-03-22 13:51:02,632 : INFO : Doc:  79000 -> Total Lines to write:   616661
2017-03-22 13:51:02,663 : INFO : Batch creation working on 90000

2017-03-22 13:51:02,972 : INFO : Doc:  90000 -> Total Lines to write:       16
2017-03-22 13:53:21,706 : INFO : Finished batch 30000 of size 10000 in 55m 1s
2017-03-22 13:53:21,715 : INFO : For index 30000, the actual number of lines written is: 643976


writing batch 30000


2017-03-22 13:53:33,934 : INFO : Finished batch 50000 of size 10000 in 55m 13s
2017-03-22 13:53:33,955 : INFO : For index 50000, the actual number of lines written is: 666009


writing batch 50000


2017-03-22 13:53:53,311 : INFO : Finished batch 20000 of size 10000 in 55m 32s
2017-03-22 13:53:53,342 : INFO : For index 20000, the actual number of lines written is: 648468


writing batch 20000


2017-03-22 13:54:19,528 : INFO : Batch creation working on 100000

2017-03-22 13:54:19,877 : INFO : Doc: 100000 -> Total Lines to write:       63
2017-03-22 13:54:24,557 : INFO : Finished batch 40000 of size 10000 in 56m 4s
2017-03-22 13:54:24,559 : INFO : For index 40000, the actual number of lines written is: 644504


writing batch 40000


2017-03-22 13:54:30,005 : INFO : Batch creation working on 110000

2017-03-22 13:54:30,632 : INFO : Doc: 110000 -> Total Lines to write:       31
2017-03-22 13:54:51,271 : INFO : Batch creation working on 120000

2017-03-22 13:54:51,649 : INFO : Doc: 120000 -> Total Lines to write:       34
2017-03-22 13:55:01,777 : INFO : Finished batch 10000 of size 10000 in 56m 41s
2017-03-22 13:55:01,809 : INFO : For index 10000, the actual number of lines written is: 654890


writing batch 10000


2017-03-22 13:55:11,920 : INFO : Finished batch 60000 of size 10000 in 56m 51s
2017-03-22 13:55:11,944 : INFO : For index 60000, the actual number of lines written is: 669832


writing batch 60000


2017-03-22 13:55:22,787 : INFO : Finished batch 80000 of size 10000 in 57m 2s
2017-03-22 13:55:22,810 : INFO : For index 80000, the actual number of lines written is: 670546


writing batch 80000


2017-03-22 13:55:26,627 : INFO : Batch creation working on 130000

2017-03-22 13:55:27,586 : INFO : Doc: 130000 -> Total Lines to write:       80
2017-03-22 13:56:01,693 : INFO : Batch creation working on 140000

2017-03-22 13:56:01,958 : INFO : Doc: 140000 -> Total Lines to write:       34
2017-03-22 13:56:18,022 : INFO : Batch creation working on 150000

2017-03-22 13:56:18,452 : INFO : Doc: 150000 -> Total Lines to write:       48
2017-03-22 13:56:23,687 : INFO : Batch creation working on 160000

2017-03-22 13:56:24,256 : INFO : Doc: 160000 -> Total Lines to write:       50
2017-03-22 13:56:43,218 : INFO : Doc:  91000 -> Total Lines to write:    67284
2017-03-22 13:57:02,548 : INFO : Finished batch 70000 of size 10000 in 58m 41s
2017-03-22 13:57:02,568 : INFO : For index 70000, the actual number of lines written is: 684962


writing batch 70000


2017-03-22 13:57:49,841 : INFO : Batch creation working on 170000

2017-03-22 13:57:50,463 : INFO : Doc: 170000 -> Total Lines to write:       41
2017-03-22 14:00:03,890 : INFO : Doc: 101000 -> Total Lines to write:    71449
2017-03-22 14:00:11,296 : INFO : Doc: 121000 -> Total Lines to write:    67722
2017-03-22 14:00:14,526 : INFO : Doc: 111000 -> Total Lines to write:    71370
2017-03-22 14:00:32,489 : INFO : Doc: 131000 -> Total Lines to write:    63562
2017-03-22 14:01:36,314 : INFO : Doc: 141000 -> Total Lines to write:    69633
2017-03-22 14:01:43,059 : INFO : Doc: 151000 -> Total Lines to write:    68519
2017-03-22 14:02:06,084 : INFO : Doc:  92000 -> Total Lines to write:   136083
2017-03-22 14:02:24,811 : INFO : Doc: 161000 -> Total Lines to write:    73352
2017-03-22 14:03:15,231 : INFO : Doc: 171000 -> Total Lines to write:    65056
2017-03-22 14:05:28,786 : INFO : Doc: 112000 -> Total Lines to write:   134824
2017-03-22 14:05:38,094 : INFO : Doc: 102000 -> Total Lines to w

writing batch 90000


2017-03-22 14:49:12,427 : INFO : Doc: 129000 -> Total Lines to write:   638099
2017-03-22 14:49:42,945 : INFO : Batch creation working on 180000

2017-03-22 14:49:43,620 : INFO : Doc: 180000 -> Total Lines to write:      100
2017-03-22 14:49:44,176 : INFO : Doc: 169000 -> Total Lines to write:   642987
2017-03-22 14:50:33,993 : INFO : Doc: 119000 -> Total Lines to write:   642771
2017-03-22 14:51:56,018 : INFO : Doc: 109000 -> Total Lines to write:   638586
2017-03-22 14:53:09,662 : INFO : Doc: 179000 -> Total Lines to write:   654335
2017-03-22 14:53:40,250 : INFO : Finished batch 130000 of size 10000 in 58m 13s
2017-03-22 14:53:40,259 : INFO : For index 130000, the actual number of lines written is: 686283


writing batch 130000


2017-03-22 14:54:19,116 : INFO : Finished batch 150000 of size 10000 in 58m 1s
2017-03-22 14:54:19,123 : INFO : For index 150000, the actual number of lines written is: 698839


writing batch 150000


2017-03-22 14:54:38,708 : INFO : Batch creation working on 190000

2017-03-22 14:54:39,489 : INFO : Doc: 190000 -> Total Lines to write:       69
2017-03-22 14:55:04,152 : INFO : Finished batch 120000 of size 10000 in 60m 13s
2017-03-22 14:55:04,173 : INFO : For index 120000, the actual number of lines written is: 707329


writing batch 120000


2017-03-22 14:55:17,721 : INFO : Batch creation working on 200000

2017-03-22 14:55:17,859 : INFO : Doc: 200000 -> Total Lines to write:       14
2017-03-22 14:56:14,130 : INFO : Doc: 181000 -> Total Lines to write:    77751
2017-03-22 14:56:14,886 : INFO : Batch creation working on 210000

2017-03-22 14:56:15,589 : INFO : Doc: 210000 -> Total Lines to write:       53
2017-03-22 14:56:27,198 : INFO : Finished batch 160000 of size 10000 in 60m 3s
2017-03-22 14:56:27,218 : INFO : For index 160000, the actual number of lines written is: 718500


writing batch 160000


2017-03-22 14:57:02,295 : INFO : Finished batch 110000 of size 10000 in 62m 32s
2017-03-22 14:57:02,321 : INFO : For index 110000, the actual number of lines written is: 711124


writing batch 110000


2017-03-22 14:57:32,045 : INFO : Batch creation working on 220000

2017-03-22 14:57:32,361 : INFO : Doc: 220000 -> Total Lines to write:       95
2017-03-22 14:57:52,932 : INFO : Finished batch 100000 of size 10000 in 63m 33s
2017-03-22 14:57:52,948 : INFO : For index 100000, the actual number of lines written is: 708929


writing batch 100000


2017-03-22 14:58:02,124 : INFO : Batch creation working on 230000

2017-03-22 14:58:02,870 : INFO : Doc: 230000 -> Total Lines to write:       51
2017-03-22 14:58:57,089 : INFO : Batch creation working on 240000

2017-03-22 14:58:57,340 : INFO : Doc: 240000 -> Total Lines to write:       40
2017-03-22 14:59:23,180 : INFO : Finished batch 170000 of size 10000 in 61m 33s
2017-03-22 14:59:23,208 : INFO : For index 170000, the actual number of lines written is: 723950


writing batch 170000


2017-03-22 15:00:27,040 : INFO : Batch creation working on 250000

2017-03-22 15:00:27,442 : INFO : Doc: 250000 -> Total Lines to write:       38
2017-03-22 15:00:55,809 : INFO : Doc: 191000 -> Total Lines to write:    74291
2017-03-22 15:02:27,261 : INFO : Doc: 201000 -> Total Lines to write:    80796
2017-03-22 15:02:31,762 : INFO : Doc: 182000 -> Total Lines to write:   150965
2017-03-22 15:02:41,629 : INFO : Doc: 211000 -> Total Lines to write:    75352
2017-03-22 15:03:38,606 : INFO : Doc: 221000 -> Total Lines to write:    72391
2017-03-22 15:03:53,262 : INFO : Doc: 231000 -> Total Lines to write:    70202
2017-03-22 15:05:25,441 : INFO : Doc: 241000 -> Total Lines to write:    74911
2017-03-22 15:06:28,377 : INFO : Doc: 251000 -> Total Lines to write:    70930
2017-03-22 15:07:28,861 : INFO : Doc: 192000 -> Total Lines to write:   149432
2017-03-22 15:08:12,118 : INFO : Doc: 212000 -> Total Lines to write:   142280
2017-03-22 15:08:34,860 : INFO : Doc: 202000 -> Total Lines to w

writing batch 250000


2017-03-22 15:23:16,549 : INFO : Doc: 195000 -> Total Lines to write:   365966
2017-03-22 15:24:03,456 : INFO : Doc: 205000 -> Total Lines to write:   373524
2017-03-22 15:24:20,513 : INFO : Doc: 215000 -> Total Lines to write:   364688
2017-03-22 15:24:23,808 : INFO : Doc: 225000 -> Total Lines to write:   358886
2017-03-22 15:24:36,811 : INFO : Doc: 186000 -> Total Lines to write:   444976
2017-03-22 15:25:15,583 : INFO : Doc: 235000 -> Total Lines to write:   360211
2017-03-22 15:26:51,782 : INFO : Doc: 245000 -> Total Lines to write:   372994
2017-03-22 15:27:58,683 : INFO : Doc: 196000 -> Total Lines to write:   436158
2017-03-22 15:28:59,738 : INFO : Doc: 206000 -> Total Lines to write:   443804
2017-03-22 15:29:06,601 : INFO : Doc: 216000 -> Total Lines to write:   433585
2017-03-22 15:29:17,133 : INFO : Doc: 226000 -> Total Lines to write:   431501
2017-03-22 15:29:19,577 : INFO : Doc: 187000 -> Total Lines to write:   514111
2017-03-22 15:30:32,530 : INFO : Doc: 236000 -> Tota

writing batch 180000


2017-03-22 15:46:50,444 : INFO : Doc: 239000 -> Total Lines to write:   668605
2017-03-22 15:48:48,403 : INFO : Finished batch 190000 of size 10000 in 54m 9s
2017-03-22 15:48:48,420 : INFO : For index 190000, the actual number of lines written is: 727668


writing batch 190000


2017-03-22 15:49:26,472 : INFO : Doc: 249000 -> Total Lines to write:   676103
2017-03-22 15:49:53,918 : INFO : Finished batch 210000 of size 10000 in 53m 39s
2017-03-22 15:49:53,939 : INFO : For index 210000, the actual number of lines written is: 720610


writing batch 210000


2017-03-22 15:50:06,372 : INFO : Finished batch 200000 of size 10000 in 54m 49s
2017-03-22 15:50:06,395 : INFO : For index 200000, the actual number of lines written is: 737799


writing batch 200000


2017-03-22 15:50:21,266 : INFO : Finished batch 220000 of size 10000 in 52m 49s
2017-03-22 15:50:21,289 : INFO : For index 220000, the actual number of lines written is: 722187


writing batch 220000


2017-03-22 15:51:51,584 : INFO : Finished batch 230000 of size 10000 in 53m 49s
2017-03-22 15:51:51,600 : INFO : For index 230000, the actual number of lines written is: 745680


writing batch 230000


2017-03-22 15:53:27,204 : INFO : Finished batch 240000 of size 10000 in 54m 30s
2017-03-22 15:53:27,206 : INFO : For index 240000, the actual number of lines written is: 753571


writing batch 240000


2017-03-22 16:14:18,462 : INFO : Doc: 145000 -> Total Lines to write:   400650
2017-03-22 16:17:52,145 : INFO : Doc: 146000 -> Total Lines to write:   469147
2017-03-22 16:21:21,346 : INFO : Doc: 147000 -> Total Lines to write:   534385
2017-03-22 16:25:10,632 : INFO : Doc: 148000 -> Total Lines to write:   609511
2017-03-22 16:28:38,253 : INFO : Doc: 149000 -> Total Lines to write:   679632
2017-03-22 16:32:06,957 : INFO : Finished batch 140000 of size 10000 in 156m 5s
2017-03-22 16:32:06,962 : INFO : For index 140000, the actual number of lines written is: 751135


writing batch 140000


In [25]:
multithreaded_extended_batch_creation(0)

2017-03-16 10:44:02,083 : INFO : Batch creation working on 0

2017-03-16 10:44:02,175 : INFO : Doc:      0 -> Total Lines to write:       78
2017-03-16 10:44:18,155 : INFO : Finished batch 0 of size 100 in 0m 16s
2017-03-16 10:44:18,160 : INFO : For index 0, the actual number of lines written is: 7800


writing batch 0


## Validation

In [48]:
DOCS_LIST = validation_docs_list
FILE_PREFIX = VALIDATION_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(validation_docs_list)

In [49]:
try:
    pool = ThreadPool(8)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-22 17:48:52,001 : INFO : Batch creation working on 0

2017-03-22 17:48:52,002 : INFO : Batch creation working on 10000

2017-03-22 17:48:52,002 : INFO : Batch creation working on 30000

2017-03-22 17:48:52,027 : INFO : Batch creation working on 60000

2017-03-22 17:48:52,015 : INFO : Batch creation working on 20000

2017-03-22 17:48:52,014 : INFO : Batch creation working on 40000

2017-03-22 17:48:52,027 : INFO : Batch creation working on 50000

2017-03-22 17:48:52,929 : INFO : Doc:  50000 -> Total Lines to write:       56
2017-03-22 17:48:52,994 : INFO : Doc:  60000 -> Total Lines to write:       59
2017-03-22 17:48:52,991 : INFO : Doc:      0 -> Total Lines to write:       17
2017-03-22 17:48:53,013 : INFO : Doc:  10000 -> Total Lines to write:       28
2017-03-22 17:48:53,062 : INFO : Doc:  30000 -> Total Lines to write:       55
2017-03-22 17:48:53,089 : INFO : Doc:  40000 -> Total Lines to write:       69
2017-03-22 17:48:53,123 : INFO : Doc:  20000 -> Total Lines to write

writing batch 60000


2017-03-22 17:52:16,905 : INFO : Doc:   1000 -> Total Lines to write:    56405
2017-03-22 17:52:26,985 : INFO : Doc:  11000 -> Total Lines to write:    59498
2017-03-22 17:52:45,531 : INFO : Doc:  41000 -> Total Lines to write:    67494
2017-03-22 17:52:53,967 : INFO : Doc:  21000 -> Total Lines to write:    66364
2017-03-22 17:52:56,284 : INFO : Doc:  51000 -> Total Lines to write:    71973
2017-03-22 17:52:57,315 : INFO : Doc:  31000 -> Total Lines to write:    69987
2017-03-22 17:56:04,978 : INFO : Doc:   2000 -> Total Lines to write:   115158
2017-03-22 17:56:10,504 : INFO : Doc:  12000 -> Total Lines to write:   119685
2017-03-22 17:56:48,789 : INFO : Doc:  52000 -> Total Lines to write:   137569
2017-03-22 17:56:50,789 : INFO : Doc:  42000 -> Total Lines to write:   135252
2017-03-22 17:56:57,055 : INFO : Doc:  22000 -> Total Lines to write:   132914
2017-03-22 17:56:58,551 : INFO : Doc:  32000 -> Total Lines to write:   135099
2017-03-22 17:59:42,091 : INFO : Doc:  13000 -> Tota

writing batch 0


2017-03-22 18:29:03,037 : INFO : Finished batch 10000 of size 10000 in 40m 11s
2017-03-22 18:29:03,041 : INFO : For index 10000, the actual number of lines written is: 638829


writing batch 10000


2017-03-22 18:30:00,309 : INFO : Finished batch 20000 of size 10000 in 41m 8s
2017-03-22 18:30:00,312 : INFO : For index 20000, the actual number of lines written is: 660093


writing batch 20000


2017-03-22 18:31:30,338 : INFO : Finished batch 30000 of size 10000 in 42m 38s
2017-03-22 18:31:30,346 : INFO : For index 30000, the actual number of lines written is: 683886


writing batch 30000


2017-03-22 18:31:38,686 : INFO : Finished batch 40000 of size 10000 in 42m 47s
2017-03-22 18:31:38,706 : INFO : For index 40000, the actual number of lines written is: 700228


writing batch 40000


2017-03-22 18:31:53,906 : INFO : Finished batch 50000 of size 10000 in 43m 2s
2017-03-22 18:31:53,912 : INFO : For index 50000, the actual number of lines written is: 707349


writing batch 50000


## Test

In [18]:
DOCS_LIST = test_docs_list
FILE_PREFIX = TEST_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(test_docs_list)

In [None]:
try:
    pool = ThreadPool(16)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-08 04:30:26,423 : INFO : Batch creation working on 0

2017-03-08 04:30:26,423 : INFO : Batch creation working on 20000

2017-03-08 04:30:26,423 : INFO : Batch creation working on 10000

2017-03-08 04:30:26,423 : INFO : Batch creation working on 30000

2017-03-08 04:30:48,394 : INFO : Doc:  10000 -> Total Lines to write:      105
2017-03-08 04:30:48,491 : INFO : Doc:      0 -> Total Lines to write:       86
2017-03-08 04:30:48,639 : INFO : Doc:  20000 -> Total Lines to write:      188
2017-03-08 04:30:53,858 : INFO : Doc:  30000 -> Total Lines to write:      514


## Stats

In [37]:
BATCH_SIZE = 1000

In [38]:
def multithreaded_extended_batch_creation(start_index):

#     if os.path.exists(FILE_PREFIX + str(start_index)):
#         info("Batch {} already exists, skipping..".format(start_index))
#         return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()
    
    len_abs_sentences = 0
    len_desc_sentences = 0
    len_desc_paragraphs = 0
    len_claims_sentences = 0
    
    len_abs_tokens = []
    len_desc_tokens = []
    len_claims_tokens = []
    
    
    len_desc_parag_tokens = []

    len_claims_paragraphs = []
    
    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        patent_doc = get_patent(doc_id)
        
        # Abstract
        abstract = patent_doc['abstract'][0]
        root = ET.fromstring(abstract.encode('utf-8'))
        abs_paragraphs = get_adjusted_paragraphs(root)
        
        # Description
        desc = patent_doc['description'][0]
        root = ET.fromstring(desc.encode('utf-8'))
        desc_paragraphs = get_adjusted_paragraphs(root)
        
        # Claims
        claims = patent_doc['claims'][0]
        root = ET.fromstring(claims.encode('utf-8'))
        claims_paragraphs = get_adjusted_paragraphs(root, conc_sentences=False)
#         claims_paragraphs = []
#         for claim in patent_doc['claims']:
#             claims_paragraphs.append(claim.strip())

        len_claims_paragraphs.append(len(claims_paragraphs))
    
        abstract_sentences = sum([get_sentences(abs_parag) for abs_parag in abs_paragraphs], [])
        desc_sentences = sum([get_sentences(desc_parag) for desc_parag in desc_paragraphs], [])
        claims_sentences = sum([get_sentences(claim_parag) for claim_parag in claims_paragraphs], [])
        
        len_abs_sentences += len(abstract_sentences)
        len_desc_sentences += len(desc_sentences)
        len_claims_sentences += len(claims_sentences)
        
        len_desc_paragraphs += len(desc_paragraphs)
        

        abstract_tokens = sum([sentence_wordtokenizer(parag) for parag in abs_paragraphs], [])
        
        len_desc_parag_tokens.extend([len(sentence_wordtokenizer(parag)) for parag in desc_paragraphs])
        desc_tokens = sum([sentence_wordtokenizer(parag) for parag in desc_paragraphs], [])
        
        claims_tokens = sum([sentence_wordtokenizer(parag) for parag in claims_paragraphs], [])
        
        
        len_abs_tokens.append(len(abstract_tokens))
        len_desc_tokens.append(len(desc_tokens))
        len_claims_tokens.append(len(claims_tokens))
        
        # lists of list of tokens
        doc_tokens_list = [doc_id]  + abstract_tokens + desc_tokens + claims_tokens
        abstract_tokens_list = [ABSTRACT_ID.format(doc_id)] + abstract_tokens
        description_tokens_list = [DESC_ID.format(doc_id)] + desc_tokens
        claims_tokens_list = [CLAIMS_ID.format(doc_id)] + claims_tokens
        
        # now add the tokens lists that will be written to the file
        token_lines.append(doc_tokens_list)
        token_lines.append(abstract_tokens_list)
        token_lines.append(description_tokens_list)
        token_lines.append(claims_tokens_list)
        
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    print "Average Abstract Sentences: {}".format(len_abs_sentences/doc_index)
    print "Average Desc Sentences: {}".format(len_desc_sentences/doc_index)
    print "Average Desc Paragraphs: {}".format(len_desc_paragraphs/doc_index)
    print "Average Claims Sentences: {}".format(len_claims_sentences/doc_index)
    
    
    print "Abstract Tokens: Mean: {} - Median: {}".format(np.mean(len_abs_tokens), np.median(len_abs_tokens))
    print "Description Tokens: Mean: {} - Median: {}".format(np.mean(len_desc_tokens), np.median(len_desc_tokens))
    print "Claims Tokens: Mean: {} - Median: {}".format(np.mean(len_claims_tokens), np.median(len_claims_tokens))
    print "Description Paragraphs Tokens: Mean: {} - Median: {}".format(np.mean(len_desc_parag_tokens), 
                                                                        np.median(len_desc_parag_tokens))
    
    print "Claims Paragraphs: Mean: {} - Median: {}".format(np.mean(len_claims_paragraphs), 
                                                                        np.median(len_claims_paragraphs))
#     write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines

In [39]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [40]:
try:
    pool = ThreadPool(8) # use just 6 because every batch requires a lot of memory
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE*10 )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-16 01:00:43,555 : INFO : Batch creation working on 0

2017-03-16 01:00:43,555 : INFO : Batch creation working on 10000

2017-03-16 01:00:43,558 : INFO : Batch creation working on 40000

2017-03-16 01:00:43,555 : INFO : Batch creation working on 20000

2017-03-16 01:00:43,558 : INFO : Batch creation working on 50000

2017-03-16 01:00:43,559 : INFO : Batch creation working on 30000

2017-03-16 01:00:43,591 : INFO : Batch creation working on 60000

2017-03-16 01:00:43,601 : INFO : Batch creation working on 70000

2017-03-16 01:00:43,690 : INFO : Doc:  50000 -> Total Lines to write:        4
2017-03-16 01:00:43,742 : INFO : Doc:  60000 -> Total Lines to write:        4
2017-03-16 01:00:43,767 : INFO : Doc:      0 -> Total Lines to write:        4
2017-03-16 01:00:43,806 : INFO : Doc:  40000 -> Total Lines to write:        4
2017-03-16 01:00:43,846 : INFO : Doc:  20000 -> Total Lines to write:        4
2017-03-16 01:00:43,904 : INFO : Doc:  10000 -> Total Lines to write:        4
20

Average Abstract Sentences: 3
Average Desc Sentences: 236
Average Desc Paragraphs: 56
Average Claims Sentences: 19
Abstract Tokens: Mean: 118.576 - Median: 114.0
Description Tokens: Mean: 7694.354 - Median: 5256.5
Claims Tokens: Mean: 1101.352 - Median: 828.0
Description Paragraphs Tokens: Mean: 137.372194747 - Median: 113.0
Claims Paragraphs: Mean: 19.411 - Median: 16.0


2017-03-16 01:05:31,749 : INFO : Batch creation working on 80000

2017-03-16 01:05:33,147 : INFO : Doc:  80000 -> Total Lines to write:        4
2017-03-16 01:05:49,041 : INFO : Finished batch 40000 of size 1000 in 5m 5s
2017-03-16 01:05:49,060 : INFO : For index 40000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 261
Average Desc Paragraphs: 61
Average Claims Sentences: 17
Abstract Tokens: Mean: 119.666 - Median: 119.0
Description Tokens: Mean: 8411.443 - Median: 6107.0
Claims Tokens: Mean: 1111.596 - Median: 944.0
Description Paragraphs Tokens: Mean: 137.076788944 - Median: 113.0
Claims Paragraphs: Mean: 17.226 - Median: 16.0


2017-03-16 01:05:49,886 : INFO : Batch creation working on 90000

2017-03-16 01:05:50,033 : INFO : Doc:  90000 -> Total Lines to write:        4
2017-03-16 01:05:56,244 : INFO : Finished batch 50000 of size 1000 in 5m 13s
2017-03-16 01:05:56,248 : INFO : For index 50000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 265
Average Desc Paragraphs: 62
Average Claims Sentences: 17
Abstract Tokens: Mean: 116.916 - Median: 118.0
Description Tokens: Mean: 8561.637 - Median: 5910.0
Claims Tokens: Mean: 1082.717 - Median: 892.0
Description Paragraphs Tokens: Mean: 137.32235713 - Median: 112.0
Claims Paragraphs: Mean: 17.019 - Median: 15.0


2017-03-16 01:05:57,120 : INFO : Batch creation working on 100000

2017-03-16 01:05:57,343 : INFO : Doc: 100000 -> Total Lines to write:        4
2017-03-16 01:05:58,003 : INFO : Finished batch 20000 of size 1000 in 5m 14s
2017-03-16 01:05:58,006 : INFO : For index 20000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 266
Average Desc Paragraphs: 61
Average Claims Sentences: 17
Abstract Tokens: Mean: 118.616 - Median: 121.0
Description Tokens: Mean: 8539.343 - Median: 5729.0
Claims Tokens: Mean: 1096.825 - Median: 894.0
Description Paragraphs Tokens: Mean: 138.749581607 - Median: 113.0
Claims Paragraphs: Mean: 17.846 - Median: 15.0


2017-03-16 01:05:59,193 : INFO : Batch creation working on 110000

2017-03-16 01:06:00,006 : INFO : Doc: 110000 -> Total Lines to write:        4
2017-03-16 01:06:07,053 : INFO : Finished batch 30000 of size 1000 in 5m 23s
2017-03-16 01:06:07,059 : INFO : For index 30000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 271
Average Desc Paragraphs: 64
Average Claims Sentences: 17
Abstract Tokens: Mean: 118.392 - Median: 118.0
Description Tokens: Mean: 8803.666 - Median: 6171.5
Claims Tokens: Mean: 1099.404 - Median: 923.0
Description Paragraphs Tokens: Mean: 137.510012183 - Median: 113.0
Claims Paragraphs: Mean: 17.875 - Median: 16.0


2017-03-16 01:06:08,003 : INFO : Batch creation working on 120000

2017-03-16 01:06:08,152 : INFO : Doc: 120000 -> Total Lines to write:        4
2017-03-16 01:06:14,886 : INFO : Finished batch 10000 of size 1000 in 5m 31s
2017-03-16 01:06:14,891 : INFO : For index 10000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 272
Average Desc Paragraphs: 63
Average Claims Sentences: 19
Abstract Tokens: Mean: 117.434 - Median: 116.0
Description Tokens: Mean: 8971.263 - Median: 5658.5
Claims Tokens: Mean: 1131.289 - Median: 896.0
Description Paragraphs Tokens: Mean: 141.313113334 - Median: 114.0
Claims Paragraphs: Mean: 19.008 - Median: 16.0


2017-03-16 01:06:18,493 : INFO : Finished batch 70000 of size 1000 in 5m 35s
2017-03-16 01:06:18,497 : INFO : For index 70000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 282
Average Desc Paragraphs: 66
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.501 - Median: 117.0
Description Tokens: Mean: 9176.843 - Median: 6447.5
Claims Tokens: Mean: 1058.939 - Median: 924.5
Description Paragraphs Tokens: Mean: 137.616864615 - Median: 112.0
Claims Paragraphs: Mean: 16.587 - Median: 16.0


2017-03-16 01:06:41,087 : INFO : Finished batch 60000 of size 1000 in 5m 57s
2017-03-16 01:06:41,089 : INFO : For index 60000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 295
Average Desc Paragraphs: 71
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.293 - Median: 118.0
Description Tokens: Mean: 9770.952 - Median: 6617.0
Claims Tokens: Mean: 1059.014 - Median: 906.5
Description Paragraphs Tokens: Mean: 137.735438399 - Median: 112.0
Claims Paragraphs: Mean: 16.633 - Median: 16.0


2017-03-16 01:07:02,979 : INFO : Finished batch 120000 of size 1000 in 0m 55s
2017-03-16 01:07:02,982 : INFO : For index 120000, the actual number of lines written is: 624


Average Abstract Sentences: 3
Average Desc Sentences: 295
Average Desc Paragraphs: 67
Average Claims Sentences: 34
Abstract Tokens: Mean: 123.487179487 - Median: 123.5
Description Tokens: Mean: 9056.32051282 - Median: 6444.0
Claims Tokens: Mean: 2129.06410256 - Median: 1735.0
Description Paragraphs Tokens: Mean: 135.207771079 - Median: 111.0
Claims Paragraphs: Mean: 34.3141025641 - Median: 30.0


2017-03-16 01:10:54,091 : INFO : Finished batch 80000 of size 1000 in 5m 22s
2017-03-16 01:10:54,093 : INFO : For index 80000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 277
Average Desc Paragraphs: 65
Average Claims Sentences: 16
Abstract Tokens: Mean: 114.978 - Median: 118.0
Description Tokens: Mean: 9035.3 - Median: 6557.0
Claims Tokens: Mean: 1052.833 - Median: 908.0
Description Paragraphs Tokens: Mean: 137.333373866 - Median: 113.0
Claims Paragraphs: Mean: 16.561 - Median: 16.0


2017-03-16 01:11:22,433 : INFO : Finished batch 90000 of size 1000 in 5m 33s
2017-03-16 01:11:22,436 : INFO : For index 90000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 276
Average Desc Paragraphs: 65
Average Claims Sentences: 16
Abstract Tokens: Mean: 116.039 - Median: 118.0
Description Tokens: Mean: 9215.904 - Median: 6593.5
Claims Tokens: Mean: 1066.299 - Median: 932.5
Description Paragraphs Tokens: Mean: 141.239908046 - Median: 116.0
Claims Paragraphs: Mean: 16.451 - Median: 16.0


2017-03-16 01:11:36,285 : INFO : Finished batch 100000 of size 1000 in 5m 39s
2017-03-16 01:11:36,288 : INFO : For index 100000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 287
Average Desc Paragraphs: 67
Average Claims Sentences: 16
Abstract Tokens: Mean: 113.675 - Median: 117.0
Description Tokens: Mean: 9544.564 - Median: 6973.5
Claims Tokens: Mean: 1108.746 - Median: 977.5
Description Paragraphs Tokens: Mean: 141.208486211 - Median: 116.0
Claims Paragraphs: Mean: 16.438 - Median: 17.0


2017-03-16 01:11:54,205 : INFO : Finished batch 110000 of size 1000 in 5m 55s
2017-03-16 01:11:54,208 : INFO : For index 110000, the actual number of lines written is: 4000


Average Abstract Sentences: 3
Average Desc Sentences: 311
Average Desc Paragraphs: 73
Average Claims Sentences: 16
Abstract Tokens: Mean: 114.923 - Median: 118.0
Description Tokens: Mean: 10124.423 - Median: 7225.5
Claims Tokens: Mean: 1053.8 - Median: 936.5
Description Paragraphs Tokens: Mean: 138.761056974 - Median: 115.0
Claims Paragraphs: Mean: 16.433 - Median: 16.0
