In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
import os
import re
import urllib2

import numpy as np
import random
import time

import json

import logging
from logging import info

from multiprocessing import Pool as ThreadPool
import itertools

import xml.etree.ElementTree as ET

import nltk

from thesis.utils.text import get_sentences, sentence_wordtokenizer

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
root_location = "/mnt/data2/shalaby/"
exports_location = root_location + "exported_data/"

# training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)
training_file = root_location + 'docs_output.json'

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
# training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
# validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
training_docs_list_file = exports_location + "extended_pv_training_docs_list.pkl"
validation_docs_list_file = exports_location + "extended_pv_validation_docs_list.pkl"
test_docs_list_file = exports_location + "extended_pv_test_docs_list.pkl"

In [4]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 17 s, sys: 748 ms, total: 17.8 s
Wall time: 17.8 s


In [5]:
len(training_docs_list)

120156

#### Extraction Utils 

In [6]:
ES_URL = 'http://localhost:9200/patents/patent/{}'
HEADING_TAG = 'heading'
PARAGRAPH_TAG = 'p'
UL_TAG = 'ul'
LI_TAG = 'li'
OL_TAG = 'ol'
DESC_OF_DRAWINGS_TAG = 'description-of-drawings'
MIN_PARAGRAPH_LENGTH = 50

In [7]:
def merge_with_previous(curr_node_tag, previous_node_tag, previous_node_text):
    if curr_node_tag == PARAGRAPH_TAG and previous_node_tag == HEADING_TAG:
        return True
    if previous_node_text and len(previous_node_text) < MIN_PARAGRAPH_LENGTH:
        return True
    return False
    
def get_paragraphs(root):
    paragraphs = []
    previous_node_text = None
    previous_tag = None
    for child in root:
        node_text = None
        if child.tag != DESC_OF_DRAWINGS_TAG:
            node_text = get_node_text(child)
            if node_text.strip():
                if merge_with_previous(child.tag, previous_tag, previous_node_text) and len(paragraphs) > 0:
                    paragraphs[-1] += ' ' + node_text
                else:
                    paragraphs.append(node_text)
        else:
            node_text = extract_desc_of_drawings_paragraph(child)
            paragraphs.append(node_text)
            
        previous_tag = child.tag
        previous_node_text = node_text
    return paragraphs
    
def extract_desc_of_drawings_paragraph(node):
    previous_tag = None
    sentences = []
    for child in node:
        node_text = get_node_text(child)
        if child.tag == PARAGRAPH_TAG and previous_tag == HEADING_TAG:
            sentences[-1] += ' ' + node_text
        else:
            # a paragraph in drawings descriptions is treated as a sentence
            if child.tag == PARAGRAPH_TAG:
                node_text = apply_sentence_end(node_text)
            sentences.append(node_text)
        previous_tag = child.tag
    
    return ' '.join(sentences)

def apply_sentence_end(text):
    if text and text.strip():
        text = text.strip().strip(';.')
        text += '. '
    return text

def itertext_custom(self):
    tag = self.tag
    if not isinstance(tag, basestring) and tag is not None:
        return
    if self.text:
        if tag == LI_TAG:
            yield apply_sentence_end(self.text)
        else:
            yield self.text.replace('\n',' ')
    for e in self:
        for s in e.itertext_custom():
            yield s
        if e.tail:
            yield e.tail

ET.Element.itertext_custom = itertext_custom
# def get_node_text(node):
#     node_text = ''
#     for child in node:
#         # for ul tags, get li tags as sentences
#         if child.tag == UL_TAG:
#             li_sentences = [apply_sentence_end(get_node_text_iterative(c)) for c in child]
#             child_text = ' '.join(li_sentences)
#         else:
#             child_text = get_node_text_iterative(child)
#         node_text += child_text
#     return node_text
        
get_node_text = lambda node: ''.join(node.itertext_custom()).strip()

In [8]:
def conc_paragraphs(parag1, parag2):
    return parag1.strip('.') + '.' + ' ' + parag2

def concatenate_sentences_to_paragraphs(paragraphs):
    """
    for 1 sentence paragraphs, concatenate them to the next or previous paragraph depending on context
    """
    for i in range(len(paragraphs)):
        if i >= len((paragraphs)): break
        parag = paragraphs[i]
        sentences = get_sentences(parag)
        
        if len(sentences) == 1:
            prev_paragraph = paragraphs[i-1] if i-1 >= 0 else None
            next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None

            if (next_paragraph and len(get_sentences(next_paragraph)) == 1):
                # If a series of 1 sentence length paragraphs exist, conc all of them in one paragraph
                while True:
                    if next_paragraph and len(get_sentences(next_paragraph)) == 1:
                        parag = conc_paragraphs(parag, next_paragraph)
                        paragraphs[i] = parag
                        del paragraphs[i+1]

                        # reinitialize for loop
                        next_paragraph = paragraphs[i+1] if i+1 < len(paragraphs) else None
                    else:
                        break

            # otherwise, just concatenate the 1 sentence paragraph to the previous paragraph
            elif prev_paragraph:
#                 print '============== Found prev eligible paragraph'
                prev_paragraph = conc_paragraphs(prev_paragraph, parag)
                paragraphs[i-1] = prev_paragraph
                del paragraphs[i]

            # if this is the first paragraph, then just concatenate it with the next one
            elif next_paragraph:
                parag = conc_paragraphs(parag, next_paragraph)
                paragraphs[i] = parag
                del paragraphs[i+1]

def get_adjusted_paragraphs(root):
    paragraphs = get_paragraphs(root)
    concatenate_sentences_to_paragraphs(paragraphs)
    return paragraphs

In [9]:
def get_patent_description(doc_id):
    url_to_fetch = ES_URL.format(doc_id)

    response = urllib2.urlopen(url_to_fetch)
    patent_content = response.read()

    patent_object = json.loads(patent_content)['_source']
    desc = patent_object['description'][0]
    return desc

# Actual Extraction

In [10]:
SENTENCE_ID = "{}_p{}_s{}"
PARAGRAPH_ID = "{}_p{}"

In [11]:
BATCH_SIZE = 10000

preprocessed_location = "/mnt/data/shalaby/" + "preprocessed_data/"
TRAINING_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_training_docs_data_preprocessed-"
VALIDATION_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_validation_docs_data_preprocessed-"
TEST_PREPROCESSED_FILES_PREFIX = preprocessed_location + "extended_pv_test_docs_data_preprocessed-"

In [12]:
def multithreaded_extended_batch_creation(start_index):

    if os.path.exists(FILE_PREFIX + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    token_lines = []
    start_time = time.time()

    for doc_index, doc_id in enumerate(DOCS_LIST[start_index:]):
        desc = get_patent_description(doc_id)
        root = ET.fromstring(desc.encode('utf-8'))

        # lists of list of tokens in sentence, paragraph
        all_sentences_tokens_list = []
        all_paragraphs_tokens_list = []
        
        # just one list of all tokens in the doc
        doc_tokens_list = [doc_id] 
        
        # get paragraphs
        paragraphs = get_adjusted_paragraphs(root)

        for parag_index, parag in enumerate(paragraphs):
            paragraph_id = PARAGRAPH_ID.format(doc_id, parag_index+1)
            curr_paragraph_tokens = [paragraph_id]
            
            # get sentences in paragraphs
            parag_sentences = get_sentences(parag)
            for sentence_index, parag_sent in enumerate(parag_sentences):
                sentence_id = SENTENCE_ID.format(doc_id, parag_index+1, sentence_index+1)
                curr_sentence_tokens = sentence_wordtokenizer(parag_sent)
                
                if len(curr_sentence_tokens) > 0:
                    all_sentences_tokens_list.append([sentence_id] + curr_sentence_tokens)

                    # we do this incrementally instead of tokenizing the whole paragraph/doc again
                    curr_paragraph_tokens.extend(curr_sentence_tokens)
                    doc_tokens_list.extend(curr_sentence_tokens)
                
            all_paragraphs_tokens_list.append(curr_paragraph_tokens)
            del parag_sentences

        # now add the document tokens and the sentence tokens to the list that will be written to the file
        token_lines.append(doc_tokens_list)
        token_lines.extend(all_paragraphs_tokens_list)
        token_lines.extend(all_sentences_tokens_list)

        del paragraphs, desc
        if doc_index % 1000 == 0: info("Doc: {:6} -> Total Lines to write: {:8}".format(start_index + doc_index, len(token_lines)))
        if doc_index >= BATCH_SIZE - 1:
            break
    duration = time.time() - start_time
    info("Finished batch {} of size {:d} in {:.0f}m {:.0f}s".format(start_index, BATCH_SIZE, * divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    
    write_batch(FILE_PREFIX, token_lines, start_index)
    del token_lines

In [13]:
def write_batch(file_prefix, batch_lines, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        with open(file_prefix + str(batch_start), 'w') as batch_file:
            for line in batch_lines:
                batch_file.write((u" ".join(line) + "\n").encode('utf-8'))

## Training

In [14]:
DOCS_LIST = training_docs_list
FILE_PREFIX = TRAINING_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(training_docs_list)

In [15]:
try:
    pool = ThreadPool(6) # use just 6 because every batch requires a lot of memory
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-08 02:09:14,865 : INFO : Batch creation working on 10000

2017-03-08 02:09:14,865 : INFO : Batch creation working on 0

2017-03-08 02:09:14,867 : INFO : Batch creation working on 50000

2017-03-08 02:09:14,865 : INFO : Batch creation working on 40000

2017-03-08 02:09:14,865 : INFO : Batch creation working on 30000

2017-03-08 02:09:14,865 : INFO : Batch creation working on 20000

2017-03-08 02:09:14,974 : INFO : Doc:  50000 -> Total Lines to write:       75
2017-03-08 02:09:15,030 : INFO : Doc:  20000 -> Total Lines to write:      267
2017-03-08 02:09:15,033 : INFO : Doc:      0 -> Total Lines to write:      174
2017-03-08 02:09:15,039 : INFO : Doc:  30000 -> Total Lines to write:      327
2017-03-08 02:09:15,077 : INFO : Doc:  40000 -> Total Lines to write:      263
2017-03-08 02:09:15,131 : INFO : Doc:  10000 -> Total Lines to write:      283
2017-03-08 02:10:52,977 : INFO : Doc:  21000 -> Total Lines to write:   329036
2017-03-08 02:10:53,302 : INFO : Doc:  51000 -> Total L

writing batch 20000


2017-03-08 02:28:45,406 : INFO : Batch creation working on 60000

2017-03-08 02:28:45,809 : INFO : Doc:  60000 -> Total Lines to write:      157
2017-03-08 02:29:24,680 : INFO : Doc:  49000 -> Total Lines to write:  3099990
2017-03-08 02:29:51,038 : INFO : Finished batch 50000 of size 10000 in 20m 36s
2017-03-08 02:29:51,041 : INFO : For index 50000, the actual number of lines written is: 3432568


writing batch 50000


2017-03-08 02:29:53,004 : INFO : Finished batch 0 of size 10000 in 20m 38s
2017-03-08 02:29:53,006 : INFO : For index 0, the actual number of lines written is: 3173898


writing batch 0


2017-03-08 02:30:15,113 : INFO : Batch creation working on 70000

2017-03-08 02:30:15,132 : INFO : Batch creation working on 80000

2017-03-08 02:30:15,823 : INFO : Doc:  70000 -> Total Lines to write:      346
2017-03-08 02:30:16,160 : INFO : Doc:  80000 -> Total Lines to write:     1184
2017-03-08 02:30:43,971 : INFO : Finished batch 10000 of size 10000 in 21m 29s
2017-03-08 02:30:43,976 : INFO : For index 10000, the actual number of lines written is: 3195411


writing batch 10000


2017-03-08 02:31:05,493 : INFO : Batch creation working on 90000

2017-03-08 02:31:06,562 : INFO : Doc:  90000 -> Total Lines to write:      172
2017-03-08 02:31:21,877 : INFO : Finished batch 30000 of size 10000 in 22m 7s
2017-03-08 02:31:21,881 : INFO : For index 30000, the actual number of lines written is: 3310449


writing batch 30000


2017-03-08 02:31:43,766 : INFO : Batch creation working on 100000

2017-03-08 02:31:44,433 : INFO : Doc: 100000 -> Total Lines to write:      236
2017-03-08 02:31:50,654 : INFO : Doc:  61000 -> Total Lines to write:   367284
2017-03-08 02:32:41,747 : INFO : Finished batch 40000 of size 10000 in 23m 27s
2017-03-08 02:32:41,751 : INFO : For index 40000, the actual number of lines written is: 3440447


writing batch 40000


2017-03-08 02:33:04,569 : INFO : Batch creation working on 110000

2017-03-08 02:33:07,535 : INFO : Doc: 110000 -> Total Lines to write:      839
2017-03-08 02:33:31,181 : INFO : Doc:  81000 -> Total Lines to write:   344219
2017-03-08 02:33:36,290 : INFO : Doc:  71000 -> Total Lines to write:   349699
2017-03-08 02:34:22,483 : INFO : Doc:  91000 -> Total Lines to write:   343071
2017-03-08 02:34:42,892 : INFO : Doc:  62000 -> Total Lines to write:   683732
2017-03-08 02:35:04,733 : INFO : Doc: 101000 -> Total Lines to write:   355971
2017-03-08 02:36:00,055 : INFO : Doc: 111000 -> Total Lines to write:   385464
2017-03-08 02:36:23,209 : INFO : Doc:  72000 -> Total Lines to write:   687879
2017-03-08 02:36:31,062 : INFO : Doc:  82000 -> Total Lines to write:   712707
2017-03-08 02:37:24,486 : INFO : Doc:  92000 -> Total Lines to write:   698973
2017-03-08 02:37:28,270 : INFO : Doc:  63000 -> Total Lines to write:  1037330
2017-03-08 02:37:48,410 : INFO : Doc: 102000 -> Total Lines to w

writing batch 60000


2017-03-08 02:57:03,185 : INFO : Doc: 118000 -> Total Lines to write:  3116340
2017-03-08 02:57:12,487 : INFO : Batch creation working on 120000

2017-03-08 02:57:12,642 : INFO : Doc: 120000 -> Total Lines to write:      136
2017-03-08 02:57:27,585 : INFO : Doc: 109000 -> Total Lines to write:  3235133
2017-03-08 02:57:32,701 : INFO : Doc:  99000 -> Total Lines to write:  3232649
2017-03-08 02:57:34,690 : INFO : Doc:  89000 -> Total Lines to write:  3181306
2017-03-08 02:57:42,056 : INFO : Finished batch 120000 of size 10000 in 0m 30s
2017-03-08 02:57:42,059 : INFO : For index 120000, the actual number of lines written is: 56442


writing batch 120000


2017-03-08 02:58:44,958 : INFO : Finished batch 70000 of size 10000 in 28m 30s
2017-03-08 02:58:44,962 : INFO : For index 70000, the actual number of lines written is: 3481088


writing batch 70000


2017-03-08 02:59:38,516 : INFO : Doc: 119000 -> Total Lines to write:  3454499
2017-03-08 03:00:18,424 : INFO : Finished batch 90000 of size 10000 in 29m 13s
2017-03-08 03:00:18,429 : INFO : For index 90000, the actual number of lines written is: 3605520


writing batch 90000


2017-03-08 03:00:22,717 : INFO : Finished batch 100000 of size 10000 in 28m 39s
2017-03-08 03:00:22,721 : INFO : For index 100000, the actual number of lines written is: 3594460


writing batch 100000


2017-03-08 03:00:58,523 : INFO : Finished batch 80000 of size 10000 in 30m 43s
2017-03-08 03:00:58,528 : INFO : For index 80000, the actual number of lines written is: 3561721


writing batch 80000


2017-03-08 03:01:46,356 : INFO : Finished batch 110000 of size 10000 in 28m 42s
2017-03-08 03:01:46,361 : INFO : For index 110000, the actual number of lines written is: 3696658


writing batch 110000


In [None]:
multithreaded_extended_batch_creation(10000)

## Validation

In [16]:
DOCS_LIST = validation_docs_list
FILE_PREFIX = VALIDATION_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(validation_docs_list)

In [17]:
try:
    pool = ThreadPool(16)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-08 03:21:11,761 : INFO : Batch creation working on 20000

2017-03-08 03:21:11,761 : INFO : Batch creation working on 10000

2017-03-08 03:21:11,760 : INFO : Batch creation working on 0

2017-03-08 03:21:12,077 : INFO : Doc:      0 -> Total Lines to write:       72
2017-03-08 03:21:12,111 : INFO : Doc:  20000 -> Total Lines to write:      116
2017-03-08 03:21:12,166 : INFO : Doc:  10000 -> Total Lines to write:      372
2017-03-08 03:23:09,621 : INFO : Doc:   1000 -> Total Lines to write:   307968
2017-03-08 03:23:25,492 : INFO : Doc:  11000 -> Total Lines to write:   343452
2017-03-08 03:23:33,229 : INFO : Doc:  21000 -> Total Lines to write:   358265
2017-03-08 03:25:04,002 : INFO : Doc:   2000 -> Total Lines to write:   607301
2017-03-08 03:25:40,539 : INFO : Doc:  12000 -> Total Lines to write:   696745
2017-03-08 03:25:52,468 : INFO : Doc:  22000 -> Total Lines to write:   710605
2017-03-08 03:27:12,268 : INFO : Doc:   3000 -> Total Lines to write:   910284
2017-03-08 03:27

writing batch 0


2017-03-08 03:42:57,921 : INFO : Doc:  29000 -> Total Lines to write:  3205532
2017-03-08 03:43:31,627 : INFO : Finished batch 10000 of size 10000 in 22m 20s
2017-03-08 03:43:31,632 : INFO : For index 10000, the actual number of lines written is: 3300118


writing batch 10000


2017-03-08 03:44:18,162 : INFO : Finished batch 20000 of size 10000 in 23m 6s
2017-03-08 03:44:18,167 : INFO : For index 20000, the actual number of lines written is: 3405385


writing batch 20000


## Test

In [18]:
DOCS_LIST = test_docs_list
FILE_PREFIX = TEST_PREPROCESSED_FILES_PREFIX
SAMPLE_SIZE = len(test_docs_list)

In [None]:
try:
    pool = ThreadPool(16)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(SAMPLE_SIZE, BATCH_SIZE)[0]+1) * BATCH_SIZE, BATCH_SIZE )
    indices = pool.map(multithreaded_extended_batch_creation, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()

2017-03-08 04:30:26,423 : INFO : Batch creation working on 0

2017-03-08 04:30:26,423 : INFO : Batch creation working on 20000

2017-03-08 04:30:26,423 : INFO : Batch creation working on 10000

2017-03-08 04:30:26,423 : INFO : Batch creation working on 30000

2017-03-08 04:30:48,394 : INFO : Doc:  10000 -> Total Lines to write:      105
2017-03-08 04:30:48,491 : INFO : Doc:      0 -> Total Lines to write:       86
2017-03-08 04:30:48,639 : INFO : Doc:  20000 -> Total Lines to write:      188
2017-03-08 04:30:53,858 : INFO : Doc:  30000 -> Total Lines to write:      514
