In [1]:
import cPickle as pickle
import string
import os
import re
import urllib2

import numpy as np
import random
import time

import json

import logging
from logging import info

from multiprocessing import Pool as ThreadPool
import itertools

import xml.etree.ElementTree as ET

from thesis.utils.text import get_sentences, sentence_wordtokenizer

In [2]:
# Start from a clean root logger so that re-running this cell does not stack
# up duplicate handlers (basicConfig does nothing while handlers are present).
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)

# basicConfig installs a default StreamHandler that uses the format below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Data root and the directory that holds the exported pickles.
root_location = "/mnt/data2/shalaby/"
exports_location = "{}exported_data/".format(root_location)


def _export_path(file_name):
    """Return the path of ``file_name`` inside ``exports_location``."""
    return exports_location + file_name


# Training input (JSON); the sampled variant is kept for reference.
# training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)
training_file = "{}docs_output.json".format(root_location)

# Classification metadata pickles.
doc_classifications_map_file = _export_path("doc_classification_map.pkl")
sections_file = _export_path("sections.pkl")
classes_file = _export_path("classes.pkl")
subclasses_file = _export_path("subclasses.pkl")
classifications_output = _export_path("classifications.pkl")

# Document id lists per split; the sampled variants are kept for reference.
# training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
# validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
training_docs_list_file = _export_path("extended_pv_training_docs_list.pkl")
validation_docs_list_file = _export_path("extended_pv_validation_docs_list.pkl")
test_docs_list_file = _export_path("extended_pv_test_docs_list.pkl")

In [4]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 20.8 s, sys: 2.92 s, total: 23.7 s
Wall time: 23.8 s


In [5]:
# Number of entries in the training document list loaded above.
len(training_docs_list)

120156

#### Extraction Utils 

In [6]:
# URL template of the patent document endpoint; `{}` is filled with a document id.
#ES_URL = 'http://localhost:9200/patents/patent/{}'
ES_URL = 'http://yell.dbs.ifi.lmu.de:9200/patents/patent/{}'
# XML tag names used while walking a patent description.
HEADING_TAG = 'heading'
PARAGRAPH_TAG = 'p'
UL_TAG = 'ul'
LI_TAG = 'li'
OL_TAG = 'ol'
DESC_OF_DRAWINGS_TAG = 'description-of-drawings'
# A node whose text is shorter than this many characters gets the following
# node merged onto it (see merge_with_previous).
MIN_PARAGRAPH_LENGTH = 50

In [7]:
def merge_with_previous(curr_node_tag, previous_node_tag, previous_node_text):
    """Decide whether the current node belongs to the previous paragraph.

    Returns True when the current node is a paragraph directly following a
    heading, or when the previous node's text is non-empty but shorter than
    MIN_PARAGRAPH_LENGTH characters; False otherwise.
    """
    follows_heading = (curr_node_tag == PARAGRAPH_TAG
                       and previous_node_tag == HEADING_TAG)
    # bool() keeps the result a real boolean when the previous text is None/''.
    previous_is_short = (bool(previous_node_text)
                         and len(previous_node_text) < MIN_PARAGRAPH_LENGTH)
    return follows_heading or previous_is_short
    
def get_paragraphs(root):
    """Split a parsed patent description into a list of paragraph strings.

    Each direct child of ``root`` is turned into text. A child is appended to
    the previous paragraph when ``merge_with_previous`` says so (paragraph
    after a heading, or previous text too short); otherwise it starts a new
    paragraph. A ``description-of-drawings`` child is collapsed into a single
    paragraph of its own and is never merged onto the previous one.

    Children without any text are skipped entirely: they neither produce an
    (empty) paragraph nor influence the merge decision for the next child.

    :param root: iterable of ElementTree elements (the description node)
    :return: list of paragraph strings, in document order
    """
    paragraphs = []
    previous_node_text = None
    previous_tag = None
    for child in root:
        is_drawings = child.tag == DESC_OF_DRAWINGS_TAG
        if is_drawings:
            # Stripped so it is consistent with get_node_text, which strips too.
            node_text = extract_desc_of_drawings_paragraph(child).strip()
        else:
            node_text = get_node_text(child)

        if not node_text.strip():
            # A blank node must not become an empty paragraph, and must not
            # reset the "previous node" state: otherwise an empty heading
            # would glue the next paragraph onto an unrelated one.
            continue

        if (not is_drawings and paragraphs
                and merge_with_previous(child.tag, previous_tag, previous_node_text)):
            paragraphs[-1] += ' ' + node_text
        else:
            paragraphs.append(node_text)

        previous_tag = child.tag
        previous_node_text = node_text
    return paragraphs
    
def extract_desc_of_drawings_paragraph(node):
    """Collapse a ``description-of-drawings`` node into one paragraph string.

    Every ``p`` child is treated as one sentence and is normalised with
    ``apply_sentence_end``. A ``p`` that directly follows a heading is appended
    to that heading's entry; every other child starts a new entry. The entries
    are joined with single spaces.

    :param node: the ``description-of-drawings`` element
    :return: the joined text of all children
    """
    previous_tag = None
    sentences = []
    for child in node:
        node_text = get_node_text(child)
        if child.tag == PARAGRAPH_TAG:
            # A paragraph in drawings descriptions is treated as a sentence.
            # This also applies to the paragraph merged onto a heading, which
            # would otherwise run into the next sentence without a period.
            node_text = apply_sentence_end(node_text)

        if child.tag == PARAGRAPH_TAG and previous_tag == HEADING_TAG:
            # The heading was appended on the previous iteration, so
            # sentences[-1] exists here.
            sentences[-1] += ' ' + node_text
        else:
            sentences.append(node_text)
        previous_tag = child.tag

    return ' '.join(sentences)

def apply_sentence_end(text):
    """Make ``text`` end like a sentence.

    Surrounding whitespace and any leading/trailing ';' or '.' characters are
    removed and '. ' is appended. Empty, whitespace-only or None input is
    returned unchanged.
    """
    if not text or not text.strip():
        return text
    return text.strip().strip(';.') + '. '

def itertext_custom(self):
    """Yield the text fragments of an element and its descendants, in order.

    Differences from a plain text walk:

    * newlines are replaced by spaces in every fragment, in element text as
      well as in the tail text that follows a child element;
    * an ``li`` element is emitted as ONE fragment: the text of the whole item
      (including inline children such as ``sub``) is collected first and only
      then terminated with ``apply_sentence_end``, so the period ends up at
      the end of the item rather than after its leading text.

    Elements whose tag is neither a string nor None yield nothing.

    :param self: the element to walk (the function is also installed as a
        method on ``ET.Element``)
    """
    tag = self.tag
    if not isinstance(tag, basestring) and tag is not None:
        return

    if tag == LI_TAG:
        pieces = [self.text] if self.text else []
        for child in self:
            # Direct recursion: works without the Element monkey patch.
            pieces.extend(itertext_custom(child))
            if child.tail:
                pieces.append(child.tail)
        item_text = ''.join(pieces).replace('\n', ' ')
        if item_text:
            yield apply_sentence_end(item_text)
        return

    if self.text:
        yield self.text.replace('\n', ' ')
    for child in self:
        for fragment in itertext_custom(child):
            yield fragment
        if child.tail:
            # Tails get the same newline normalisation as element text.
            yield child.tail.replace('\n', ' ')

# Install the custom iterator as a method on every Element so that it can be
# invoked as ``node.itertext_custom()``.
ET.Element.itertext_custom = itertext_custom


def get_node_text(node):
    """Return the text of ``node`` and all its descendants as one string.

    The fragments produced by ``node.itertext_custom()`` are concatenated
    without a separator and surrounding whitespace is stripped.
    """
    return ''.join(node.itertext_custom()).strip()

In [8]:
def conc_paragraphs(parag1, parag2):
    """Join two paragraphs with exactly one '. ' between them.

    Trailing whitespace and trailing periods of ``parag1`` are removed before
    the separator is added, so "foo", "foo." and "foo. " all join as
    "foo. <parag2>". Leading characters of ``parag1`` (including leading
    dots, e.g. ".NET") are left untouched, as is ``parag2``.
    """
    head = parag1.rstrip()
    # Peel off any mix of trailing periods and whitespace ("foo. . " -> "foo").
    while head.endswith('.'):
        head = head[:-1].rstrip()
    return head + '. ' + parag2

def concatenate_sentences_to_paragraphs(paragraphs):
    """
    Merge one-sentence paragraphs into their neighbours, in place.

    Paragraphs are visited from left to right. For each paragraph that
    consists of a single sentence:

    * if the next paragraph is also a single sentence, the whole run of
      consecutive one-sentence paragraphs is folded into the current one;
    * otherwise, if there is a non-empty previous paragraph, the current
      paragraph is appended to it;
    * otherwise, if there is a non-empty next paragraph, that one is appended
      to the current paragraph.

    The list is modified in place (it shrinks as paragraphs are merged) and
    nothing is returned.
    """
    def is_single_sentence(candidate):
        # A missing (None) or empty neighbour never counts as one sentence.
        return bool(candidate) and len(get_sentences(candidate)) == 1

    def paragraph_after(position):
        following = position + 1
        return paragraphs[following] if following < len(paragraphs) else None

    idx = 0
    # The list shrinks while merging, so the bound is re-read on every pass.
    while idx < len(paragraphs):
        current = paragraphs[idx]
        if len(get_sentences(current)) == 1:
            before = paragraphs[idx - 1] if idx > 0 else None
            after = paragraph_after(idx)

            # Fold a run of one-sentence paragraphs into the current one.
            absorbed_run = False
            while is_single_sentence(after):
                current = conc_paragraphs(current, after)
                paragraphs[idx] = current
                del paragraphs[idx + 1]
                after = paragraph_after(idx)
                absorbed_run = True

            if not absorbed_run:
                if before:
                    # Lone sentence: attach it to the paragraph before it.
                    paragraphs[idx - 1] = conc_paragraphs(before, current)
                    del paragraphs[idx]
                elif after:
                    # No usable previous paragraph: pull the next one in.
                    paragraphs[idx] = conc_paragraphs(current, after)
                    del paragraphs[idx + 1]
        idx += 1

def get_adjusted_paragraphs(root):
    """Return the paragraphs of ``root`` after merging one-sentence paragraphs.

    Runs ``get_paragraphs`` and then ``concatenate_sentences_to_paragraphs``
    on the resulting list.
    """
    adjusted = get_paragraphs(root)
    # Merging happens in place; the call itself returns nothing.
    concatenate_sentences_to_paragraphs(adjusted)
    return adjusted

### Some Exploration of the tags we have

In [197]:
# Collect every XML tag that occurs in the descriptions of the first 300
# training documents.
tags_set = set()
for doc_id in training_docs_list[:300]:
    url_to_fetch = ES_URL.format(doc_id)
    response = urllib2.urlopen(url_to_fetch)
    try:
        patent_content = response.read()
    finally:
        # Close the HTTP response explicitly; the loop issues hundreds of
        # requests and would otherwise leave every connection open.
        response.close()
    patent_object = json.loads(patent_content)['_source']
    desc = patent_object['description'][0]
    root = ET.fromstring(desc.encode('utf-8'))
    doc_tags = [elem.tag for elem in root.iter()]
    tags_set.update(doc_tags)
#     tag_to_check = 'chemistry'
#     if tag_to_check in doc_tags:
#         for elem in root.iter():
#             if elem.tag == tag_to_check:
#                 print get_node_text(elem)

In [195]:
# Display the distinct tag names collected from the sampled documents.
tags_set

{'br',
 'chemistry',
 'colspec',
 'description',
 'description-of-drawings',
 'entry',
 'heading',
 'img',
 'li',
 'math',
 'maths',
 'mfrac',
 'mi',
 'mn',
 'mo',
 'mover',
 'mrow',
 'mspace',
 'msqrt',
 'mstyle',
 'msub',
 'msubsup',
 'msup',
 'mtable',
 'mtd',
 'mtext',
 'mtr',
 'munder',
 'munderover',
 'p',
 'row',
 'smallcaps',
 'sub',
 'sub2',
 'sup',
 'table',
 'tables',
 'tbody',
 'tgroup',
 'thead',
 'u',
 'ul'}

## Exploration of the sub tag

In [16]:
tag_num = 0
doc_num = 0
for doc_id in training_docs_list[:1000]:
    url_to_fetch = ES_URL.format(doc_id)
    response = urllib2.urlopen(url_to_fetch)
    patent_content = response.read()
    patent_object = json.loads(patent_content)['_source']
    desc = patent_object['description'][0]
    root = ET.fromstring(desc.encode('utf-8'))
    doc_tags = [elem.tag for elem in root.iter()]
    tag_to_check = 'sub'
    tag_num += doc_tags.count(tag_to_check)
    if doc_tags.count(tag_to_check) > 0: doc_num += 1
print tag_num
print doc_num

39134
431


In [238]:
for child in root:
    if child.findall('ul'):
        print 'found it'
        print child.text
        print list(child.itertext())

found it
Accordingly, the invention provides a reheat combustion system for a gas turbine, the said system comprising:

['Accordingly, the invention provides a reheat combustion system for a gas turbine, the said system comprising:\n', '\n', '\n', '\n', 'a mixing tube adapted to be fed by products of a primary combustion zone of the gas turbine and by fuel injected by a lance;', '\n', 'a combustion chamber fed by the said mixing tube; and', '\n', 'at least one perforated acoustic screen;', '\n', 'wherein the or each said acoustic screen is provided inside the mixing tube or the said combustion chamber, at a position where it faces, but is spaced from, a perforated wall thereof; such that, in use, the said perforated wall experiences impingement cooling as it admits air into the combustion system for onward passage through the perforations of the said acoustic screen, and the acoustic screen damps acoustic pulsations in the said mixing tube and combustion chamber.', '\n', '\n', '\n', '\

## Actual Extraction

In [9]:
# Fetch one patent and parse its description for the cells below.
# Alternative documents that can be fetched with this cell:
# url_to_fetch = ES_URL.format(training_docs_list[51])
# url_to_fetch = ES_URL.format('07249209')
# url_to_fetch = ES_URL.format('07861227')
url_to_fetch = ES_URL.format('06984387')

response = urllib2.urlopen(url_to_fetch)
try:
    patent_content = response.read()
finally:
    # Close the HTTP response explicitly instead of leaving it open.
    response.close()

patent_object = json.loads(patent_content)['_source']
desc = patent_object['description'][0]
# fromstring is given UTF-8 encoded bytes rather than the unicode string.
root = ET.fromstring(desc.encode('utf-8'))

In [338]:
# Preview the first 300 characters of the raw description XML.
desc[:300]

u'<description id="description">\n<heading id="h-0001" level="1">RELATED APPLICATIONS</heading>\n<p id="p-0003" num="0001">This application claims priority benefit under Title 35 \xa7 19 (e) of U.S. provisional Application No. 60/135,265, filed May 21, 1999, and U.S. provisional Application No. 60/193,727,'

In [330]:
# Tags of the root element and all of its descendants, in document order.
[elem.tag for elem in root.iter()]

['description',
 'heading',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'heading',
 'p',
 'p',
 'p',
 'p',
 'description-of-drawings',
 'heading',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'heading',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'p',
 'sub',
 'sub',
 'p',
 'p',
 'sub',
 'p',
 'p',
 'sub',
 'sub',
 'sub',
 'sub',
 'sub',
 'sub',
 'p',
 'sub',
 'sub',
 'sub',
 'p',
 'p',
 'p']

In [10]:
# Raw paragraphs (before one-sentence paragraphs are merged); each paragraph
# is printed followed by the number of sentences get_sentences finds in it.
paragraphs = get_paragraphs(root)

for parag in paragraphs:
    print parag
    print 
    print len(get_sentences(parag))
    print

CROSS REFERENCE TO RELATED APPLICATIONS This application is a continuation-in-part of U.S. patent application Ser. No. 10/313,158, filed Dec. 6, 2002 now U.S. Pat. No. 7,171,652 entitled “Software Development Environment With Design Specification Verification Tool”, which is related to U.S. patent application Ser. No. 09/881,250, entitled “Automated Management Of Development Project Files Over A Network”, and U.S. patent application Ser. No. 10/059,694, entitled “Project Management Over A Network With Automated Task Schedule Update”, the content of all of which is incorporated by reference in its entirety for all purposes as if fully set forth herein.

1

FIELD OF THE INVENTION The present invention relates generally to project management and, more specifically, to automatically checking and validating a software class specification.

1

BACKGROUND OF THE INVENTION Product development projects typically require significant effort to monitor and manage. Furthermore, computer software de

In [9]:

# Paragraphs after one-sentence paragraphs have been merged; each is printed
# with its index, followed by its sentence count.
paragraphs = get_adjusted_paragraphs(root)

for i, parag in enumerate(paragraphs):
    print i, parag
    print 
    print len(get_sentences(parag))
    print
    

0 CROSS REFERENCE TO RELATED APPLICATIONS This application is a continuation-in-part of U.S. patent application Ser. No. 10/313,158, filed Dec. 6, 2002 now U.S. Pat. No. 7,171,652 entitled “Software Development Environment With Design Specification Verification Tool”, which is related to U.S. patent application Ser. No. 09/881,250, entitled “Automated Management Of Development Project Files Over A Network”, and U.S. patent application Ser. No. 10/059,694, entitled “Project Management Over A Network With Automated Task Schedule Update”, the content of all of which is incorporated by reference in its entirety for all purposes as if fully set forth herein. FIELD OF THE INVENTION The present invention relates generally to project management and, more specifically, to automatically checking and validating a software class specification.

2

1 BACKGROUND OF THE INVENTION Product development projects typically require significant effort to monitor and manage. Furthermore, computer software de

In [13]:
# Inspect how one paragraph is split into sentences.
# NOTE(review): 129 and 28 are hard-coded indices, presumably chosen for the
# document currently held in `paragraphs` - confirm before reusing this cell.
sentences = get_sentences(paragraphs[129])
for i, sent in enumerate(sentences):
    # Each sentence is encoded to UTF-8 bytes before being formatted.
    print '{} => {}'.format(i, sent.encode('utf-8'))

# Word-tokenize a single sentence of that paragraph.
sentence_wordtokenizer(sentences[28])

0 => APPENDIX A

 


  Author: XXX



 

  6.6.7.4.4
CHTMLProcessor Class Specification

 6.6.7.4.4.1
Base Class

 None

 6.6.7.4.4.2
Function List



 
  public:

  CHTMLProcessor(CHTTPSession& in_HTTPSession);

  ~CHTMLProcessor( );

  bool obtainDataFromHTMLFile(std::map<infoType, std::pair<std::string, int> > & inOut_Status,

    std::vector<SKeyValueInfo> & in_KeyValueInfoVector);

 private:

  void setupExtractionStateVector(std::vector<SKeyValueInfo> & in_KeyValueInfoVector);

  bool adjustMap(std::map<infoType, std::pair<std::string, int> > & inOut_Status,

    std::string in_sValue, infoType in_InfoType, int in_nRelativePriority);

  bool obtainNonBlankLine(std::string & out_sLine);

  bool obtainNonBlankLine(std::string & out_sLine, std::string in_sLine);

  bool clear(std::string & out_sLine);



 

  6.6.7.4.4.3
Defined Type List



 
  private:

  enum ELineStatus {

    ePreTargetLine = 0,

    eTargetLine,

    eFinished

  };

  struct SExtractionState {



 

     ELin

[]

In [12]:

# Merge one-sentence paragraphs; `paragraphs` is modified in place, so
# re-running this cell operates on the already merged list.
concatenate_sentences_to_paragraphs(paragraphs)

# Print each paragraph with its index, followed by its sentence count.
for i, parag in enumerate(paragraphs):
    print i, parag
    print 
    print len(get_sentences(parag))
    print
    

0 CROSS REFERENCE TO RELATED APPLICATIONS This application is a continuation-in-part of U.S. patent application Ser. No. 10/313,158, filed Dec. 6, 2002 now U.S. Pat. No. 7,171,652 entitled “Software Development Environment With Design Specification Verification Tool”, which is related to U.S. patent application Ser. No. 09/881,250, entitled “Automated Management Of Development Project Files Over A Network”, and U.S. patent application Ser. No. 10/059,694, entitled “Project Management Over A Network With Automated Task Schedule Update”, the content of all of which is incorporated by reference in its entirety for all purposes as if fully set forth herein. FIELD OF THE INVENTION The present invention relates generally to project management and, more specifically, to automatically checking and validating a software class specification.

2

1 BACKGROUND OF THE INVENTION Product development projects typically require significant effort to monitor and manage. Furthermore, computer software de

In [314]:
# Show the text fragments that itertext_custom yields for each top-level node.
for node in root:
    print list(node.itertext_custom())

['FIELD OF THE INVENTION']
['This invention relates to round balers that include a bale wrapping apparatus for wrapping a harvested crop material with a wrapping material, such as a net or twine, to produce shaped bales of the harvested crop material for convenient transport and storage. More particularly, the present invention relates to a round baler with bale wrapping apparatus, wherein the round baler also includes a low wrapping material indication system for providing an indication signal to an operator of the round baler, thereby signifying that the wrapping material, whether a net material or a twine material stored by the bale wrapping apparatus, is running low. In this manner, the operator is made aware of the impending need to provide more wrapping material for the bale wrapping apparatus.']
['BACKGROUND OF THE INVENTION']
[u'Round balers (commonly known simply as \u201cbalers\u201d) of the prior art generally have a bale forming chamber defined by walls of a housing and an 

In [11]:
# Display the complete raw description XML (can be very long).
desc



In [10]:
# Display the full `_source` record of the fetched patent.
patent_object

{u'abstract': [u'<abstract id="abstract">\n<p id="p-0001" num="0000">The present invention relates to FMDC vaccine based on peptides having a sequence of at least 8 amino acids, which corresponds to a partial sequence of the non-structural protein region of FMDV, which was selected by immunoreactivity with FMDV-specific antibodies or by immunoreactivity with FMDV-specific T lymphocytes, and to their production and their use.</p>\n</abstract>\n'],
 u'application-country': u'US',
 u'application-date': u'19970908',
 u'application-doc-number': u'09254966',
 u'claims': [u'<claims id="claims">\n<claim id="CLM-00001" num="00001">\n<claim-text>1. A peptide consisting of a polypeptide sequence of SEQ ID NO. 45.</claim-text>\n</claim>\n</claims>\n'],
 u'classification-ipc': [{u'class': u'61',
   u'section': u'A',
   u'subclass': u'K'}],
 u'classification-national-further-classification': [u'4241861',
  u'530300',
  u'530327'],
 u'classification-national-main': u'4242161',
 u'description': [u'<de

In [53]:
# Re-parse the description XML; fromstring is given UTF-8 encoded bytes.
root = ET.fromstring(desc.encode('utf-8'))

In [56]:
# Print the tag of every direct child of the description root.
for child in root:
    print child.tag

heading
p
heading
p
p
p
p
p
p
heading
p
p
p
p
description-of-drawings
heading
p
p
p
p
p
p
p
