Skip to content

Commit

Permalink
Extracting periodic documents' abstracts, authors and page numbers
Browse files Browse the repository at this point in the history
  • Loading branch information
waldofe committed Dec 21, 2012
1 parent a0b8a4e commit 9709b38
Show file tree
Hide file tree
Showing 9 changed files with 370 additions and 137 deletions.
Empty file.
92 changes: 92 additions & 0 deletions nsi/metadataextractor/extractors/event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#coding: utf-8
import re
from os.path import abspath, dirname, join, basename, splitext
from nltk.tokenize import line_tokenize, word_tokenize
from nsi.metadataextractor.preparator import Preparator
from nsi.metadataextractor.xml_parser import Parser

ROOT = join(abspath(dirname(__file__)), '..')

class EventExtractor(object):
    """Extracts author, title and abstract metadata from event (conference) papers.

    Reads the extraction rules from ``templates/event.xml`` and converts the
    configured page of the document to raw text for analysis.
    """

    def __init__(self, doc_dir):
        """doc_dir: path to the source document, extension included."""
        # Plain pdftotext layout (no "-raw" flag) for event documents.
        convertion_style = ""
        parse = Parser(join(ROOT, 'templates', 'event.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
        # Matches e-mail addresses so author lines can be stripped of them.
        self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')

    def _author_metadata(self):
        """Return the list of author names found on the page.

        Scans each line (until the template "breaker" pattern is reached) for
        words present in the names corpus, skipping lines that contain
        template-defined residue words; any e-mail found in a matching line is
        stripped out. Falls back to an ABNT-style name regex over the whole
        page text when the corpus scan finds nothing.
        """
        self.authors = []
        breaker = self._template_metadata['author_breaker'][0]
        residues = self._template_metadata['author_residue']
        name_corpus = self._preparator.parse_corpus('names')
        # ABNT citation form: optional initials ("j. ") followed by "name;"
        abnt_name = re.compile(r'(\w[.]\s)*(\w+[;])')
        # (removed unused local `has_only_email` present in earlier revisions)
        for line in self._linetokenized_onepage_doc:
            has_breaker = re.match(breaker, line)
            if has_breaker: break
            line_mod = set(word_tokenize(line))
            has_corpus_common = bool(line_mod.intersection(name_corpus))
            has_residue = bool(line_mod.intersection(residues))
            if has_corpus_common and not has_residue:
                find_email = self._email_regex.search(line)
                if find_email:
                    email = find_email.group()
                    line = line.replace(email, '').strip()
                # Skip lines that contained nothing but an e-mail address.
                if line != '': self.authors.append(line)
        if not self.authors:
            # Fallback: repeatedly harvest ABNT-style names from the page text,
            # deleting each match so the search advances.
            clean_onepage_doc = self._clean_onepage_doc
            find_author = abnt_name.search(clean_onepage_doc)
            while find_author:
                author = find_author.group()
                self.authors.append(author)
                clean_onepage_doc = clean_onepage_doc.replace(author, '')
                find_author = abnt_name.search(clean_onepage_doc)
        return self.authors

    def _abstract_metadata(self):
        """Return the abstract: the text between "resumo" and the keywords
        marker ("palavras chave", "unitermos" or "descritores").

        Raises AttributeError when no abstract section is found on the page.
        """
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        self.abstract = regex.search(self._clean_onepage_doc).group(1).strip().capitalize()
        return self.abstract

    def _title_metadata(self):
        """Return the title: the leading page lines accumulated until the
        first line containing an author, an e-mail or a template breaker word."""
        self.title = ''
        self.title_catcher = []
        has_author = False
        authors = self._author_metadata()
        breakers = self._template_metadata['title_breaker']
        for line in self._linetokenized_onepage_doc:
            has_breaker = bool(set(word_tokenize(line)).intersection(breakers))
            has_email = self._email_regex.search(line)
            for author in authors:
                has_author = (author in line) or has_author
            if not has_email and not has_author and not has_breaker:
                self.title_catcher.append(line)
            else:
                self.title = ' '.join(self.title_catcher).capitalize()
                break
        return self.title

    def all_metadata(self):
        """Return a dict with authors, title, abstract and number of pages.

        Page count comes from the PDF embedded metadata (0 for non-PDF
        documents). The temporary converted text file is removed afterwards.
        """
        if self._preparator.doc_ext == '.pdf':
            pdf_embed_metadata = self._preparator.pdf_embed_metadata()
            self._pdf_num_pages = pdf_embed_metadata.numPages
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata': self._author_metadata(),
                    'title_metadata': self._title_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            # Best-effort cleanup: the temp file may already be gone.
            print('Temporary document already removed..')
        return metadata
53 changes: 53 additions & 0 deletions nsi/metadataextractor/extractors/periodic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#coding: utf-8
import re
from os.path import abspath, dirname, join, basename, splitext
from nltk.tokenize import line_tokenize, word_tokenize

## Root path
from nsi.metadataextractor.preparator import Preparator
from nsi.metadataextractor.xml_parser import Parser

#Extractor
from nsi.metadataextractor.extractors.event import EventExtractor

ROOT = join(abspath(dirname(__file__)), '..')

class PeriodicExtractor(object):
    """Extracts author, abstract and page-count metadata from periodic
    (journal) articles.

    Delegates author and abstract extraction to EventExtractor while using
    its own template (``templates/periodic.xml``) and the pdftotext "-raw"
    conversion style suited to periodical page layouts.
    """

    def __init__(self, doc_dir):
        """doc_dir: path to the source document, extension included."""
        # "-raw" keeps pdftotext content-stream order for periodical layouts.
        convertion_style = "-raw"
        self._eventextractor = EventExtractor(doc_dir)
        parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')

    def _author_metadata(self):
        """Return the authors, reusing the event extractor's author logic."""
        self.authors = self._eventextractor._author_metadata()
        return self.authors

    def _abstract_metadata(self):
        """Return the abstract, reusing the event extractor's abstract logic."""
        self.abstract = self._eventextractor._abstract_metadata()
        return self.abstract

    def all_metadata(self):
        """Return a dict with authors, abstract and number of pages.

        Page count comes from the PDF embedded metadata (0 for non-PDF
        documents). The temporary converted text file is removed afterwards.
        """
        if self._preparator.doc_ext == '.pdf':
            pdf_embed_metadata = self._preparator.pdf_embed_metadata()
            self._pdf_num_pages = pdf_embed_metadata.numPages
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata': self._author_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            # Best-effort cleanup: the temp file may already be gone.
            print('Temporary document already removed..')
        return metadata
Original file line number Diff line number Diff line change
@@ -1,65 +1,29 @@
#coding: utf-8
import re
from os import system, remove
from os.path import abspath, dirname, join, basename, splitext
from string import punctuation
from pyPdf import PdfFileReader
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import line_tokenize, word_tokenize
from xml_parser import Parser
from nsi.metadataextractor.preparator import Preparator
from nsi.metadataextractor.xml_parser import Parser

ROOT = abspath(dirname(__file__))

class Preparator(object):
    """Document preparation helper: text conversion, corpus loading and
    access to embedded PDF metadata."""

    def __init__(self, doc_dir):
        """doc_dir: path to the document, extension included."""
        path_without_ext, extension = splitext(doc_dir)
        self.doc_dir = path_without_ext
        self.doc_ext = extension
        self.doc_name = basename(path_without_ext)
        self.temp_text_doc = '%s.txt' % self.doc_name

    def raw_text_convertion(self, page1, page2):
        """Convert pages page1..page2 of the document to lowercase,
        digit-free raw text and return it."""
        if self.doc_ext == '.pdf':
            command = ("pdftotext -enc UTF-8 -f %i -l %i %s.pdf %s.txt"
                       % (page1, page2, self.doc_dir, self.doc_dir))
            system(command)
        reader = PlaintextCorpusReader(dirname(self.doc_dir), self.temp_text_doc)
        lowered = reader.raw().decode('utf-8').lower().encode('utf-8')
        self.raw_text = re.sub(r'[0-9]', '', lowered)
        return self.raw_text

    def remove_converted_document(self):
        """Delete the temporary .txt file produced by raw_text_convertion."""
        remove('%s.txt' % self.doc_dir)

    def parse_corpus(self, corpus_type):
        """Load corpus file <corpus_type>.txt as lowercase lines; lines of the
        'institution' corpus are split on commas."""
        self.corpus_type = '%s.txt' % corpus_type
        self.corpus_dir = join(ROOT, 'corpus')
        reader = PlaintextCorpusReader(self.corpus_dir, self.corpus_type)
        self.corpus = line_tokenize(reader.raw().lower())
        if corpus_type == 'institution':
            self.corpus = [entry.split(',') for entry in self.corpus]
        return self.corpus

    def pdf_embed_metadata(self):
        """Return a PdfFileReader over the source PDF."""
        return PdfFileReader(file("%s.pdf" % self.doc_dir, "rb"))

ROOT = join(abspath(dirname(__file__)), '..')

class TccExtractor(object):

def __init__(self, doc_dir):
convertion_style = ""
parse = Parser(join(ROOT, 'templates', 'tcc.xml'))
self._template_metadata = parse.xml_template_metadata()
page = self._template_metadata['page']
pages = self._template_metadata['pages']
self._preparator = Preparator(doc_dir)
self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page)
self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1])
self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style)
self._linetokenized_onepage_raw_doc = open('%s.txt' %self._preparator.doc_dir).readlines()
self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ')
self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
self._wordtokenized_onepage_doc = [w for w in word_tokenize(self._raw_onepage_doc) if w not in list(punctuation)]
self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc)
self.linebreak = "\n"


def _author_metadata(self):
self.authors = []
name_corpus = self._preparator.parse_corpus('names')
Expand Down Expand Up @@ -109,8 +73,8 @@ def _institution_metadata(self):
if has_institution:
institution_corpus = self._preparator.parse_corpus('institution')
for preposition, institution in institution_corpus:
name_mod = set(institution.split())
if name_mod.intersection(self._wordtokenized_onepage_doc) == name_mod:
institution_mod = set(institution.split())
if institution_mod.intersection(self._wordtokenized_onepage_doc) == institution_mod:
self.institution = self.institution + preposition + institution.title()
break
return self.institution
Expand Down Expand Up @@ -171,88 +135,4 @@ def all_metadata(self):
self._preparator.remove_converted_document()
except OSError:
print 'Temporary document already removed..'
return metadata


class EventExtractor(object):
    # Extracts author, title and abstract metadata from event (conference)
    # papers, driven by the templates/event.xml extraction template.

    def __init__(self, doc_dir):
        # doc_dir: path to the source document, extension included.
        parse = Parser(join(ROOT, 'templates', 'event.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
        # Matches e-mail addresses so they can be stripped from author lines.
        self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')

    def _author_metadata(self):
        """Return author names found on the page.

        Lines are scanned (until the template 'breaker' pattern matches) for
        words in the names corpus; lines containing template 'residue' words
        are skipped. Falls back to an ABNT-style name regex over the whole
        page text when nothing is found.
        """
        self.authors = []
        breaker = self._template_metadata['author_breaker'][0]
        residues = self._template_metadata['author_residue']
        name_corpus = self._preparator.parse_corpus('names')
        # ABNT citation form: optional initials ("j. ") followed by "name;"
        abnt_name = re.compile(r'(\w[.]\s)*(\w+[;])')
        has_only_email = False  # NOTE(review): assigned but never used
        for line in self._linetokenized_onepage_doc:
            has_breaker = re.match(breaker, line)
            if has_breaker: break
            line_mod = set(word_tokenize(line))
            has_corpus_common = bool(line_mod.intersection(name_corpus))
            has_residue = bool(line_mod.intersection(residues))
            if has_corpus_common and not has_residue:
                find_email = self._email_regex.search(line)
                if find_email:
                    email = find_email.group()
                    line = line.replace(email, '').strip()
                # NOTE(review): a line holding only an e-mail appends '' here.
                self.authors.append(line)
        if not self.authors:
            # Fallback: repeatedly harvest ABNT-style names, deleting each
            # match so the search advances through the page text.
            clean_onepage_doc = self._clean_onepage_doc
            find_author = abnt_name.search(clean_onepage_doc)
            while find_author:
                author = find_author.group()
                self.authors.append(author)
                clean_onepage_doc = clean_onepage_doc.replace(author, '')
                find_author = abnt_name.search(clean_onepage_doc)
        return self.authors

    def _abstract_metadata(self):
        """Return the abstract: text between "resumo" and "palavras chave".

        Raises AttributeError when no abstract section is found on the page.
        """
        regex = re.compile(r'resumo:* (.*?) palavr(a|as)(.|\s)chav(e|es).')
        self.abstract = regex.search(self._clean_onepage_doc).group(1).strip().capitalize()
        return self.abstract

    def _title_metadata(self):
        """Return the title: leading page lines accumulated until the first
        line containing an author, an e-mail or a template breaker word."""
        self.title = ''
        self.title_catcher = []
        has_author = False
        authors = self._author_metadata()
        breakers = self._template_metadata['title_breaker']
        for line in self._linetokenized_onepage_doc:
            has_breaker = bool(set(word_tokenize(line)).intersection(breakers))
            has_email = self._email_regex.search(line)
            for author in authors:
                has_author = (author in line) or has_author
            if not has_email and not has_author and not has_breaker:
                self.title_catcher.append(line)
            else:
                self.title = ' '.join(self.title_catcher).capitalize()
                break
        return self.title

    def all_metadata(self):
        """Return a dict with authors, title, abstract and page count
        (0 for non-PDFs); removes the temporary converted text file after."""
        if self._preparator.doc_ext == '.pdf':
            pdf_embed_metadata = self._preparator.pdf_embed_metadata()
            self._pdf_num_pages = pdf_embed_metadata.numPages
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata': self._author_metadata(),
                    'title_metadata': self._title_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            # Best-effort cleanup: the temp file may already be gone.
            print 'Temporary document already removed..'
        return metadata
return metadata
51 changes: 51 additions & 0 deletions nsi/metadataextractor/preparator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#coding: utf-8
import re
import sys
from os import system, remove
from os.path import abspath, dirname, join, basename, splitext
from string import punctuation
from pyPdf import PdfFileReader
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import line_tokenize, word_tokenize

ROOT = abspath(dirname(__file__))
CORPUS_PATH = join(ROOT, 'corpus')

class Preparator(object):
    """Prepares a document for metadata extraction: raw-text conversion,
    punctuation-free word tokenizing, corpus loading and access to embedded
    PDF metadata."""

    def __init__(self, doc_dir):
        """doc_dir: path to the document, extension included (e.g. 'x/paper.pdf')."""
        self.doc_dir, self.doc_ext = splitext(doc_dir)
        self.doc_name = basename(self.doc_dir)
        self.temp_text_doc = '%s.txt' % self.doc_name

    def raw_text_convertion(self, page1, page2, convertion_style):
        """Convert pages page1..page2 to lowercase, digit-free raw text.

        convertion_style is appended to the pdftotext invocation ("" for the
        default layout, "-raw" for content-stream order).
        NOTE(review): the path is interpolated unquoted into a shell command;
        paths with spaces or shell metacharacters will break or be unsafe.
        """
        if self.doc_ext == '.pdf':
            system("pdftotext -enc UTF-8 -f %i -l %i %s.pdf %s.txt %s"
                   % (page1, page2, self.doc_dir, self.doc_dir, convertion_style))
        raw_text = PlaintextCorpusReader(dirname(self.doc_dir), self.temp_text_doc).raw()
        encoded_lowertext = raw_text.decode('utf-8').lower().encode('utf-8')
        # Digits carry no author/title information, so strip them all.
        self.raw_text = re.sub(r'[0-9]', '', encoded_lowertext)
        return self.raw_text

    def wordtokenized_punctuation_exclusion(self, raw_text):
        """Word-tokenize raw_text and drop tokens that are punctuation marks."""
        # Hoist the membership set: one build, O(1) lookups per token.
        punctuation_marks = set(punctuation)
        return [word for word in word_tokenize(raw_text)
                if word not in punctuation_marks]

    def remove_converted_document(self):
        """Delete the temporary .txt file produced by raw_text_convertion."""
        remove('%s.txt' % self.doc_dir)

    def parse_corpus(self, corpus_type):
        """Load corpus file <corpus_type>.txt as lowercase lines; lines of the
        'institution' corpus are split on commas into [preposition, name] pairs."""
        self.corpus_type = '%s.txt' % corpus_type
        self.corpus = line_tokenize(PlaintextCorpusReader(CORPUS_PATH, self.corpus_type).raw().lower())
        if corpus_type == 'institution':
            self.corpus = [line.split(',') for line in self.corpus]
        return self.corpus

    def pdf_embed_metadata(self):
        """Return a PdfFileReader over the source PDF (gives numPages etc.)."""
        embed_metadata = PdfFileReader(file("%s.pdf" % self.doc_dir, "rb"))
        return embed_metadata
12 changes: 12 additions & 0 deletions nsi/metadataextractor/templates/periodic.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Extraction template for periodic (journal) documents.
     Consumed by nsi.metadataextractor.xml_parser.Parser; the attribute
     values drive PeriodicExtractor/EventExtractor scanning. -->
<root>
    <!-- OnePage/page: the single page converted to text for extraction. -->
    <OnePage page = "1">
        <metadata id = "author">
            <!-- breaker: regex-matched word that ends the author line scan. -->
            <breaker type="general">resumo</breaker>
            <!-- residue: comma-separated words whose presence disqualifies a
                 line from being an author line. -->
            <residue type = "general">apoio,e-mail,rua,bairro,cep,avenida,universidade</residue>
        </metadata>
        <metadata id = "title">
            <!-- breaker: words that terminate title-line accumulation. -->
            <breaker type = "word">resumo,autores</breaker>
        </metadata>
    </OnePage>
</root>
Loading

0 comments on commit 9709b38

Please sign in to comment.