-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extracting Periodic documents abstract, authors and page numbers
- Loading branch information
Showing
9 changed files
with
370 additions
and
137 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#coding: utf-8 | ||
import re | ||
from os.path import abspath, dirname, join, basename, splitext | ||
from nltk.tokenize import line_tokenize, word_tokenize | ||
from nsi.metadataextractor.preparator import Preparator | ||
from nsi.metadataextractor.xml_parser import Parser | ||
|
||
ROOT = join(abspath(dirname(__file__)), '..') | ||
|
||
class EventExtractor(object):
    """Extract author, title and abstract metadata from the first page of
    an event (conference-style) document.

    The page number, breaker words and residue words come from the
    ``templates/event.xml`` template via ``Parser``; the document itself is
    converted to raw text by ``Preparator``.
    """

    def __init__(self, doc_dir):
        # Empty style string: pdftotext runs with its default layout mode.
        convertion_style = ""
        parse = Parser(join(ROOT, 'templates', 'event.xml'))
        self._template_metadata = parse.xml_template_metadata()
        # The template names the single page that holds the metadata.
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        # Single-line variant of the page used by the regex extractors below.
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
        # NOTE(review): inside [...] the '.' and '|' are literal characters,
        # so this matches slightly more than a strict e-mail pattern; kept
        # as-is for compatibility with previously extracted data.
        self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')

    def _author_metadata(self):
        """Return the list of author lines found on the first page.

        A line is accepted while it shares at least one token with the
        first-names corpus and contains no template 'residue' word; the
        scan stops at the first 'breaker' line (e.g. 'resumo').  When no
        line qualifies, falls back to harvesting ABNT-style 'Name;'
        patterns over the whole page.
        """
        self.authors = []
        breaker = self._template_metadata['author_breaker'][0]
        residues = self._template_metadata['author_residue']
        name_corpus = self._preparator.parse_corpus('names')
        abnt_name = re.compile(r'(\w[.]\s)*(\w+[;])')
        for line in self._linetokenized_onepage_doc:
            # Stop scanning once the breaker section (e.g. abstract) begins.
            if re.match(breaker, line):
                break
            tokens = set(word_tokenize(line))
            has_corpus_common = bool(tokens.intersection(name_corpus))
            has_residue = bool(tokens.intersection(residues))
            if has_corpus_common and not has_residue:
                # Strip an embedded e-mail address from the author line.
                find_email = self._email_regex.search(line)
                if find_email:
                    email = find_email.group()
                    line = line.replace(email, '').strip()
                if line != '':
                    self.authors.append(line)
        if not self.authors:
            # Fallback: collect 'F. Surname;'-style (ABNT) names one by one.
            clean_onepage_doc = self._clean_onepage_doc
            find_author = abnt_name.search(clean_onepage_doc)
            while find_author:
                author = find_author.group()
                self.authors.append(author)
                clean_onepage_doc = clean_onepage_doc.replace(author, '')
                find_author = abnt_name.search(clean_onepage_doc)
        return self.authors

    def _abstract_metadata(self):
        """Return the abstract: the text between 'resumo' and the keywords
        marker ('palavras chave', 'unitermos' or 'descritores'), capitalized.

        Returns an empty string when no abstract section is found
        (previously this raised AttributeError on ``None.group``).
        """
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        match = regex.search(self._clean_onepage_doc)
        self.abstract = match.group(1).strip().capitalize() if match else ''
        return self.abstract

    def _title_metadata(self):
        """Return the title: the page's leading lines joined together, up
        to the first line holding an author, an e-mail address or a
        template 'title_breaker' word."""
        self.title = ''
        self.title_catcher = []
        has_author = False
        authors = self._author_metadata()
        breakers = self._template_metadata['title_breaker']
        for line in self._linetokenized_onepage_doc:
            has_breaker = bool(set(word_tokenize(line)).intersection(breakers))
            has_email = self._email_regex.search(line)
            # 'has_author' is sticky by design: once an author line has been
            # seen, every later line is excluded from the title.
            for author in authors:
                has_author = (author in line) or has_author
            if not has_email and not has_author and not has_breaker:
                self.title_catcher.append(line)
            else:
                self.title = ' '.join(self.title_catcher).capitalize()
                break
        return self.title

    def all_metadata(self):
        """Return a dict with authors, title, abstract and PDF page count,
        then remove the temporary text file produced by the conversion."""
        if self._preparator.doc_ext == '.pdf':
            pdf_embed_metadata = self._preparator.pdf_embed_metadata()
            self._pdf_num_pages = pdf_embed_metadata.numPages
        else:
            self._pdf_num_pages = 0

        # NOTE(review): _title_metadata() re-runs _author_metadata(); the
        # duplicated scan is kept to avoid changing evaluation order.
        metadata = {'author_metadata': self._author_metadata(),
                    'title_metadata': self._title_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            # print() form: valid in both Python 2 and 3 (was a py2-only
            # print statement).
            print('Temporary document already removed..')
        return metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#coding: utf-8 | ||
import re | ||
from os.path import abspath, dirname, join, basename, splitext | ||
from nltk.tokenize import line_tokenize, word_tokenize | ||
|
||
## Root path | ||
from nsi.metadataextractor.preparator import Preparator | ||
from nsi.metadataextractor.xml_parser import Parser | ||
|
||
#Extractor | ||
from nsi.metadataextractor.extractors.event import EventExtractor | ||
|
||
ROOT = join(abspath(dirname(__file__)), '..') | ||
|
||
class PeriodicExtractor(object):
    """Extract author, abstract and page-count metadata from the first
    page of a periodic (journal) document.

    Author and abstract extraction are delegated to ``EventExtractor``;
    only the pdftotext conversion style ('-raw') and the template
    (``periodic.xml``) differ.
    """

    def __init__(self, doc_dir):
        # '-raw' makes pdftotext emit content order instead of layout order.
        convertion_style = "-raw"
        # NOTE(review): this converts the document a second time inside
        # EventExtractor's own __init__; kept as-is to preserve behaviour.
        self._eventextractor = EventExtractor(doc_dir)
        parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
        self._template_metadata = parse.xml_template_metadata()
        # The template names the single page that holds the metadata.
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')

    def _author_metadata(self):
        """Delegate author extraction to the event extractor."""
        self.authors = self._eventextractor._author_metadata()
        return self.authors

    def _abstract_metadata(self):
        """Delegate abstract extraction to the event extractor."""
        self.abstract = self._eventextractor._abstract_metadata()
        return self.abstract

    def all_metadata(self):
        """Return a dict with authors, abstract and PDF page count, then
        remove the temporary text file produced by the conversion."""
        if self._preparator.doc_ext == '.pdf':
            pdf_embed_metadata = self._preparator.pdf_embed_metadata()
            self._pdf_num_pages = pdf_embed_metadata.numPages
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata': self._author_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            # print() form: valid in both Python 2 and 3 (was a py2-only
            # print statement).
            print('Temporary document already removed..')
        return metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#coding: utf-8 | ||
import re | ||
import sys | ||
from os import system, remove | ||
from os.path import abspath, dirname, join, basename, splitext | ||
from string import punctuation | ||
from pyPdf import PdfFileReader | ||
from nltk.corpus import PlaintextCorpusReader | ||
from nltk.tokenize import line_tokenize, word_tokenize | ||
|
||
ROOT = abspath(dirname(__file__)) | ||
CORPUS_PATH = join(ROOT, 'corpus') | ||
|
||
class Preparator(object):
    """Convert a source document to plain text and expose the tokenisation,
    corpus and PDF helpers shared by the extractors."""

    def __init__(self, doc_dir):
        # Split '/path/doc.pdf' into '/path/doc' and '.pdf'.
        self.doc_dir, self.doc_ext = splitext(doc_dir)
        self.doc_name = basename(self.doc_dir)
        # Basename of the temporary text file produced by the conversion.
        self.temp_text_doc = ('%s.txt' %self.doc_name)

    def raw_text_convertion(self, page1, page2, convertion_style):
        """Return pages ``page1``..``page2`` as lower-cased text with all
        digits stripped, converting PDFs through pdftotext first.

        ``convertion_style`` is passed straight to pdftotext (e.g. '-raw').
        """
        if self.doc_ext == '.pdf':
            # SECURITY(review): doc_dir is interpolated into a shell command;
            # a path containing shell metacharacters would be unsafe.  Prefer
            # subprocess with an argument list when this is next touched.
            system("pdftotext -enc UTF-8 -f %i -l %i %s.pdf %s.txt %s"
                   %(page1, page2, self.doc_dir, self.doc_dir, convertion_style))
        raw_text = PlaintextCorpusReader(dirname(self.doc_dir), self.temp_text_doc).raw()
        # Python 2 decode/encode round-trip so lower() works on UTF-8 bytes.
        encoded_lowertext = raw_text.decode('utf-8').lower().encode('utf-8')
        self.raw_text = re.sub(r'[0-9]', '', encoded_lowertext)
        return self.raw_text

    def wordtokenized_punctuation_exclusion(self, raw_text):
        """Return ``raw_text`` word-tokenized, with pure punctuation tokens
        removed."""
        # Hoist the membership set out of the loop: O(1) per token.
        punctuation_chars = set(punctuation)
        return [word for word in word_tokenize(raw_text)
                if word not in punctuation_chars]

    def remove_converted_document(self):
        """Delete the temporary .txt created by raw_text_convertion."""
        remove('%s.txt' %self.doc_dir)

    def parse_corpus(self, corpus_type):
        """Load ``corpus/<corpus_type>.txt`` as a list of lower-cased lines.

        For the 'institution' corpus each line is additionally split on
        commas, yielding a list of lists.
        """
        self.corpus_type = '%s.txt' %corpus_type
        self.corpus = line_tokenize(PlaintextCorpusReader(CORPUS_PATH, self.corpus_type).raw().lower())
        if corpus_type == 'institution':
            for index in range(len(self.corpus)):
                self.corpus[index] = self.corpus[index].split(',')
        return self.corpus

    def pdf_embed_metadata(self):
        """Return a PdfFileReader over the source PDF.

        Uses open() instead of the Python-2-only file() builtin.
        NOTE(review): the file handle is intentionally left open because
        pyPdf reads from it lazily for the reader's lifetime.
        """
        embed_metadata = PdfFileReader(open("%s.pdf" %self.doc_dir, "rb"))
        return embed_metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Extraction template for event (conference) documents.
     page:    which single page of the document is converted and scanned.
     breaker: word that ends the scan for its metadata section when a
              matching line is reached (e.g. 'resumo' stops author search).
     residue: comma-separated words whose presence disqualifies a line
              from being treated as an author line. -->
<root>
    <OnePage page = "1">
        <metadata id = "author">
            <breaker type="general">resumo</breaker>
            <residue type = "general">apoio,e-mail,rua,bairro,cep,avenida,universidade</residue>
        </metadata>
        <metadata id = "title">
            <breaker type = "word">resumo,autores</breaker>
        </metadata>
    </OnePage>
</root>
Oops, something went wrong.