In [1]:
from bs4 import BeautifulSoup as bsoup
import re, os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools 
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer


In [13]:
# Directory that is scanned for patent XML files (relative to the working directory).
xml_path = "./xml-files"

In [24]:
def xml_parsing(in_data):
    """Extract the publication number and the abstract + claims text of a patent.

    Parameters
    ----------
    in_data
        XML markup of one patent document (the caller passes an open file
        handle); it is handed unchanged to BeautifulSoup's "lxml-xml" parser.

    Returns
    -------
    tuple
        ``(pid, text)``. ``pid`` is the string content of the ``doc-number``
        element under ``publication-reference``. ``text`` is the text of every
        ``<p>`` inside ``<abstract>`` (each followed by one space) followed by
        the text of every ``claim-text`` element. A document without an
        ``<abstract>`` contributes only its claims.

    Raises
    ------
    AttributeError
        If ``publication-reference`` or its ``doc-number`` is missing.
    """
    xml_soup = bsoup(in_data, "lxml-xml")

    pid = xml_soup.find("publication-reference").find('doc-number').string

    # Collect fragments and join once instead of growing a string with +=.
    pieces = []

    # find() returns None when the element is absent; guard so a patent
    # without an abstract no longer crashes the whole load.
    abstract = xml_soup.find('abstract')
    if abstract is not None:
        pieces.extend(p.text + " " for p in abstract.find_all('p'))

    # NOTE(review): find_all() also returns claim-text elements nested inside
    # other claim-text elements, so nested claim text is included more than
    # once -- confirm whether that double counting is intended.
    pieces.extend(tag.text for tag in xml_soup.find_all('claim-text'))

    return (pid, "".join(pieces))

In [25]:
# Parse every '.XML' file in xml_path into a {publication number: raw text} mapping.
patents_raw = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the mapping
# order (and everything derived from it downstream) reproducible across runs.
for xml_name in sorted(os.listdir(xml_path)):
    xfile = os.path.join(xml_path, xml_name)
    if os.path.isfile(xfile) and xfile.endswith('.XML'):
        # The context manager closes the handle even when parsing raises;
        # the bare open() used before leaked one descriptor per file.
        with open(xfile) as xml_handle:
            (pid, text) = xml_parsing(xml_handle)
        patents_raw[pid] = text

In [26]:
# Token pattern: runs of two or more ASCII letters, so digits, punctuation
# and single-letter words never become tokens.
tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')

In [27]:
def tokenize_patent(pid):
    """Lower-case and tokenize the raw text stored for one patent.

    The text is looked up in the module-level ``patents_raw`` mapping and
    split with the module-level ``tokenizer``.

    Returns the pair ``(pid, tokens)`` so results can be fed straight into a
    mapping constructor.
    """
    lowered = patents_raw[pid].lower()
    return (pid, tokenizer.tokenize(lowered))


# pid -> token list for every parsed patent.
patents_tokenized = {key: tokens for key, tokens in map(tokenize_patent, patents_raw)}

In [28]:
# Inspect the pid -> token-list mapping.
patents_tokenized

{'07891018': ['knee',
  'protective',
  'device',
  'for',
  'garments',
  'comprising',
  'of',
  'at',
  'least',
  'one',
  'pocket',
  'in',
  'the',
  'vicinity',
  'of',
  'the',
  'knees',
  'having',
  'an',
  'opening',
  'for',
  'receiving',
  'an',
  'insert',
  'for',
  'protecting',
  'the',
  'wearer',
  'knees',
  'garment',
  'for',
  'an',
  'infant',
  'comprising',
  'infants',
  'pants',
  'with',
  'two',
  'longitudinally',
  'extending',
  'leg',
  'portions',
  'pair',
  'of',
  'pockets',
  'sewn',
  'substantially',
  'about',
  'knee',
  'portion',
  'of',
  'each',
  'one',
  'of',
  'said',
  'leg',
  'portions',
  'each',
  'one',
  'of',
  'said',
  'pair',
  'of',
  'pockets',
  'having',
  'an',
  'upper',
  'lower',
  'and',
  'two',
  'lateral',
  'sides',
  'and',
  'at',
  'least',
  'one',
  'transverse',
  'pleat',
  'with',
  'an',
  'opening',
  'extending',
  'through',
  'one',
  'of',
  'said',
  'two',
  'lateral',
  'sides',
  'while',
  '

In [30]:
# Flatten the per-patent token lists into one corpus-wide token sequence.
all_words = [token for tokens in patents_tokenized.values() for token in tokens]

In [31]:
# Inspect the flattened corpus token list.
all_words

['knee',
 'protective',
 'device',
 'for',
 'garments',
 'comprising',
 'of',
 'at',
 'least',
 'one',
 'pocket',
 'in',
 'the',
 'vicinity',
 'of',
 'the',
 'knees',
 'having',
 'an',
 'opening',
 'for',
 'receiving',
 'an',
 'insert',
 'for',
 'protecting',
 'the',
 'wearer',
 'knees',
 'garment',
 'for',
 'an',
 'infant',
 'comprising',
 'infants',
 'pants',
 'with',
 'two',
 'longitudinally',
 'extending',
 'leg',
 'portions',
 'pair',
 'of',
 'pockets',
 'sewn',
 'substantially',
 'about',
 'knee',
 'portion',
 'of',
 'each',
 'one',
 'of',
 'said',
 'leg',
 'portions',
 'each',
 'one',
 'of',
 'said',
 'pair',
 'of',
 'pockets',
 'having',
 'an',
 'upper',
 'lower',
 'and',
 'two',
 'lateral',
 'sides',
 'and',
 'at',
 'least',
 'one',
 'transverse',
 'pleat',
 'with',
 'an',
 'opening',
 'extending',
 'through',
 'one',
 'of',
 'said',
 'two',
 'lateral',
 'sides',
 'while',
 'the',
 'upper',
 'lower',
 'and',
 'other',
 'one',
 'of',
 'the',
 'two',
 'lateral',
 'sides',
 'are'

In [34]:
# Build a bigram collocation finder over the whole corpus token stream.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
# Frequency filter with threshold 20: rare pairs are excluded before scoring.
bigram_finder.apply_freq_filter(20)
# Word filter flags any word shorter than 3 characters; since the tokenizer
# pattern already requires at least 2 letters, this targets two-letter words.
bigram_finder.apply_word_filter(lambda w: len(w) < 3)

# Keep the 100 best remaining bigrams as ranked by the PMI association measure.
top_100_bigrams = bigram_finder.nbest(bigram_measures.pmi, 100)

In [35]:
# Inspect the selected collocations (list of word pairs).
top_100_bigrams

[('harmonic', 'flex'),
 ('centrifugally', 'balanced'),
 ('robotic', 'harmonic'),
 ('expandable', 'chuck'),
 ('group', 'consisting'),
 ('charge', 'consistent'),
 ('walk', 'behind'),
 ('improperly', 'swapped'),
 ('saw', 'resonator'),
 ('behind', 'mowing'),
 ('jute', 'fibers'),
 ('actuator', 'flag'),
 ('elastic', 'band'),
 ('lead', 'frames'),
 ('drain', 'vent'),
 ('fresh', 'food'),
 ('high', 'humidity'),
 ('paper', 'particles'),
 ('fringe', 'maker'),
 ('ultrasonic', 'test'),
 ('foot', 'pedal'),
 ('elastomeric', 'mat'),
 ('capacitor', 'devices'),
 ('loaded', 'bag'),
 ('hammermilled', 'straw'),
 ('flash', 'tank'),
 ('tank', 'receiver'),
 ('hip', 'joint'),
 ('does', 'not'),
 ('duty', 'belt'),
 ('drier', 'solid'),
 ('solid', 'phase'),
 ('removable', 'joining'),
 ('cooler', 'box'),
 ('not', 'exceed'),
 ('cross', 'sectional'),
 ('case', 'packer'),
 ('vacuum', 'electronic'),
 ('driver', 'pulley'),
 ('mowing', 'machine'),
 ('fastened', 'together'),
 ('storage', 'capacity'),
 ('bus', 'bars'),
 ('p

In [37]:
# Re-tokenize every patent so that each selected bigram becomes one token.
mwetokenizer = MWETokenizer(top_100_bigrams)
colloc_patents = {
    pid: mwetokenizer.tokenize(tokens)
    for pid, tokens in patents_tokenized.items()
}

# Corpus-wide token list after merging, and the distinct tokens it contains.
all_words_colloc = [token for tokens in colloc_patents.values() for token in tokens]
colloc_voc = list(set(all_words_colloc))

In [38]:
# Vocabulary size after collocation merging.
print(len(colloc_voc))

3372


In [40]:
# Parallel lists: pids[k] is the publication number of the document whose
# space-joined tokens are stored in patent_words[k].
pids = list(colloc_patents.keys())
patent_words = [' '.join(tokens) for tokens in colloc_patents.values()]

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the space-joined token strings: one row per entry of patent_words,
# in the same order as pids.
tfidf_vectorizer = TfidfVectorizer(input = 'content', analyzer = 'word')
tfidf_vectors = tfidf_vectorizer.fit_transform(patent_words)

In [44]:
# Sanity check: rows correspond to the entries of patent_words, columns to vocabulary terms.
tfidf_vectors.shape

(100, 3372)

In [45]:
# Output file for the per-document terms; opened for writing (truncates any
# existing file) and closed explicitly in a later cell.
save_file = open("patent_student_1.txt",'w')

In [47]:
# Vocabulary terms indexed by matrix column. get_feature_names() was removed
# in scikit-learn 1.2 in favour of get_feature_names_out(), so prefer the new
# accessor and fall back to the old one on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    vocab = list(tfidf_vectorizer.get_feature_names_out())
else:
    vocab = tfidf_vectorizer.get_feature_names()

# COO form exposes the stored entries as parallel (row, col, value) arrays.
cx = tfidf_vectors.tocoo()
# The three arrays have equal length, so plain zip is the right tool;
# zip_longest would pad with None and fail on indexing if they ever differed.
for i, j, v in zip(cx.row, cx.col, cx.data):
    # One line per stored entry: pid,term,weight. The weight was previously
    # unpacked but never written, leaving a dangling trailing comma.
    save_file.write(pids[i] + ',' + vocab[j] + ',' + str(v) + '\n')

In [48]:
# Flush buffered writes and release the output file handle.
save_file.close()