Permalink
Browse files

Do it all

  • Loading branch information...
odarbelaeze committed Jul 1, 2017
1 parent f5bfbd4 commit 47086829c2cb325645440ce4130c672c6bdc8398
View
@@ -0,0 +1,9 @@
Condor 1.2.0 (2017-06-30)
=========================
Features
--------
- Adds a `from_files` class method to the `Bibliography` model so that the code
flows from the scripts to the models. Several things that previously took
filenames now take file objects. We dropped Python 3.5 support as well. (#86)
View
@@ -0,0 +1,5 @@
"""
Top level condor package.

Exposes ``__version__``, the release number of the package.
"""
# Release number of the package, kept as a plain string.
__version__ = "1.2.0"
@@ -11,6 +11,7 @@
from sqlalchemy.orm import relationship
from condor.models.base import AuditableMixing, DeclarativeBase
from condor.models.document import Document
class Bibliography(AuditableMixing, DeclarativeBase):
@@ -51,3 +52,39 @@ def words(self, fields, normalizer_class):
bib.raw_data(fields, normalizer_class)
for bib in self.documents
)))
@classmethod
def from_files(cls, kind, files,
               full_text=None, no_cache=False,
               description=None, languages=None,
               show_progress_bar=False):
    """
    Creates a bibliography and its attached documents from record files.

    :param str kind: kind of file to work with: bib, froac, xml, or isi
    :param list files: list of File objects to read from
    :param str full_text: try to find the full text in this directory
    :param bool no_cache: ignore cache when reading full text
    :param str description: description of the bibliography; when empty a
        default mentioning the number and kind of files is used
    :param list languages: filter to documents of these languages only,
        compared case-insensitively; mappings without a ``language`` entry
        count as ``english``
    :param bool show_progress_bar: forwarded as-is to
        ``Document.mappings_from_files``
    :return: the new bibliography with its ``documents`` populated; it is
        not added to any database session here
    """
    if not description:
        description = f'Document set from {len(files)} {kind} files.'

    # Build through ``cls`` so that calling this on a subclass returns an
    # instance of that subclass instead of a plain ``Bibliography``.
    bibliography = cls(description=description)

    mappings = Document.mappings_from_files(
        kind,
        files,
        full_text_path=full_text,
        force=no_cache,
        show_progress_bar=show_progress_bar,
    )

    if languages:
        languages_text = ', '.join(languages)
        bibliography.description += f' Filtered to {languages_text}'
        # The document language is lower-cased before the comparison, so the
        # requested languages have to be lower-cased as well or a request
        # such as ``English`` would never match anything.
        wanted = {language.lower() for language in languages}
        mappings = [
            mapping
            for mapping in mappings
            if mapping.get('language', 'english').lower() in wanted
        ]

    bibliography.documents = [Document(**mapping) for mapping in mappings]
    return bibliography
View
@@ -83,38 +83,66 @@ def load_full_text(record, files, force=False):
return full_text_path
@staticmethod
def mappings_from_files(record_type, files,
                        full_text_path=None, force=False,
                        show_progress_bar=False, **kwargs):
    """
    Creates document mappings out of files.

    :param record_type: type of record to extract
    :param files: files to read
    :param full_text_path: path to look for full text pdf files
    :param force: force reading the full text from pdf files
    :param show_progress_bar: when true the work is delegated to
        ``Document._mappings_with_progress_bar``
    :param kwargs: extra fields to include in the mappings
    :return: a list of mappings; records sharing a ``hash`` collapse into
        the last one read
    """
    iterator_class = record_iterator_class(record_type)

    if full_text_path:
        # Joining the components keeps ``**`` as its own path segment, which
        # is what makes the recursive glob work whether or not the given
        # path ends with a separator.
        pattern = os.path.join(full_text_path, '**', '*.pdf')
        full_text_files = {
            os.path.basename(path): path
            for path in glob.glob(pattern, recursive=True)
        }
    else:
        full_text_files = None

    if show_progress_bar:
        return Document._mappings_with_progress_bar(
            iterator_class, files,
            full_text_files, full_text_path,
            force, **kwargs)

    records = {}
    for file in files:
        for record in iterator_class(file):
            record['keywords'] = '; '.join(record.get('keywords', ''))
            record.update(kwargs)
            records[record['hash']] = record
            # The progress bar variant drops the ``file`` entry; do the same
            # here so the mappings do not depend on a display option.
            record.pop('file', None)
            if full_text_path:
                record['full_text_path'] = Document.load_full_text(
                    record,
                    full_text_files,
                    force=force
                )
    return list(records.values())
@staticmethod
def _mappings_with_progress_bar(iterator_class, files,
                                full_text_files, full_text_path,
                                force, **kwargs):
    """
    Creates document mappings out of files while showing tqdm progress bars.

    :param iterator_class: class called with each file to iterate records
    :param files: files to read
    :param full_text_files: value handed to ``Document.load_full_text`` as
        the collection of known full text files
    :param full_text_path: when truthy, the full text of every record is
        looked up
    :param force: passed as ``force`` to ``Document.load_full_text``
    :param kwargs: extra fields to include in the mappings
    :return: a list of mappings; records sharing a ``hash`` collapse into
        the last one read
    """
    records = {}
    for file in tqdm(files, desc='processing files', unit='file'):
        progress_bar = tqdm(iterator_class(file), desc='processing records',
                            unit='record', leave=False)
        for record in progress_bar:
            record['keywords'] = '; '.join(record.get('keywords', ''))
            record.update(kwargs)
            records[record['hash']] = record
            record.pop('file', None)
            if full_text_path:
                # ``load_full_text`` takes the record, the full text files
                # and ``force``; the record files themselves are not an
                # argument of it.
                record['full_text_path'] = Document.load_full_text(
                    record,
                    full_text_files,
                    force=force
                )
    return list(records.values())
@@ -0,0 +1 @@
!.gitignore
@@ -6,7 +6,7 @@
def record_iterator_class(record_type):
"""
Gets the record iterator for a given type
A way to abstract the construction of a record iterator class.
:param record_type: the type of file as string
View
@@ -58,10 +58,13 @@ class RecordIterator(object):
parser_class = RecordParser
def __init__(self, filename):
self.filename = filename
def __init__(self, _file):
self.file = _file
def get_buffer(self):
    """
    Should get a buffer of raw items to interpret to records.

    This base implementation is only a placeholder: it always raises
    ``NotImplementedError``, and the specialized iterators provide their
    own version.
    """
    raise NotImplementedError('Use an specialized class')
def __iter__(self):
View
@@ -57,7 +57,6 @@ class BibtexRecordIterator(RecordIterator):
parser_class = BibtexRecordParser
def get_buffer(self):
    """
    Yields every entry of the bibtex database loaded from ``self.file``.

    The file is neither opened nor closed here; it is read as it was handed
    to the iterator.
    """
    database = bibtexparser.load(self.file)
    yield from database.entries
View
@@ -63,6 +63,6 @@ class FroacRecordIterator(RecordIterator):
parser_class = FroacRecordParser
def get_buffer(self):
    """
    Yields every ``record`` element of the XML document in ``self.file``.

    The document is parsed once with ``minidom`` and the matching elements
    are yielded in the order ``getElementsByTagName`` returns them.
    """
    dom = minidom.parse(self.file)
    yield from dom.getElementsByTagName('record')
View
@@ -55,7 +55,7 @@ def get_buffer(self):
of the isi plain text files.
'''
buff = []
for line in open(self.filename):
for line in self.file:
buff.append(line)
if line[:2] == 'ER':
yield '\n'.join(buff)
@@ -94,7 +94,8 @@ def delete(database, target):
@bibliography.command()
@click.argument('kind', type=click.Choice(['xml', 'froac', 'bib', 'isi']))
@click.argument('files', nargs=-1, type=click.File(lazy=True))
@click.option('--full-text-path', '-f', 'fulltext', type=click.Path(exists=True),
@click.option('--full-text-path', '-f', 'full_text',
type=click.Path(exists=True),
help='Try to find full text pdf files in this path.')
@click.option('--no-cache', is_flag=True,
help='Do not cache the files for full text.')
@@ -105,57 +106,30 @@ def delete(database, target):
@click.option('--verbose/--quiet', default=False,
help='Be more verbose')
@requires_db
def create(database, **kwargs):
    """
    Populates the condor database with information from the given files

    kind parameter indicates what type of files you're working with.
    """
    # ``verbose`` only drives the console output and the progress bars; it is
    # removed from ``kwargs`` because the remaining entries are forwarded
    # unchanged to ``Bibliography.from_files``.
    verbose = kwargs.pop('verbose', False)
    if verbose:
        file_names = '\n'.join(
            f.name for f in kwargs.get('files', [])
            if hasattr(f, 'name')
        )
        kind = kwargs.get('kind')
        click.echo(f'I\'m looking for {kind} records in these files:')
        click.echo(file_names)

    kwargs['show_progress_bar'] = verbose
    _bibliography = Bibliography.from_files(**kwargs)
    database.add(_bibliography)
    # Flush before reporting so that ``eid`` is read after the bibliography
    # has been sent to the database.
    database.flush()
    click.echo(f'I\'m writing to {_bibliography.eid}')
    click.echo('And... I\'m done')
    click.echo('The database contains {} records'.format(
        Document.count(database, _bibliography.eid)
    ))
View
@@ -1,5 +1,6 @@
from setuptools import find_packages
from setuptools import setup
import condor
def get_install_requires():
@@ -19,11 +20,9 @@ def get_install_requires():
'requests==2.14.2',
]
VERSION = '1.1.1'
setup(
name='condor-ir',
version=VERSION,
version=condor.__version__,
author='Oscar D. Arbeláez-Echeverri <@odarbelaeze>, German A. Osorio-Zuluaga',
author_email='odarbelaeze@gmail.com',
packages=find_packages(),
@@ -45,7 +44,7 @@ def get_install_requires():
url='https://condor-ir.co',
download_url=(
'https://github.com/odarbelaeze/condor-ir/tarball/{}'
.format(VERSION)
.format(condor.__version__)
),
keywords=['lsa', 'search', 'search engine', 'semantics', ],
description='A latent semantic search engine implementation',
Oops, something went wrong.

0 comments on commit 4708682

Please sign in to comment.