
Merge pull request #226 from aboSamoor/senna_polish1

Adding documentation and error handling to the senna module.
2 parents 19dec81 + 667e3c4 commit 96f0c365a0852cd8ba37fe15d9e4ee0a6e19e346 @xim committed Feb 19, 2012
Showing with 184 additions and 41 deletions.
  1. +184 −41 nltk/tag/senna.py
@@ -1,149 +1,292 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Senna tagger
#
-# Copyright (C) 2001-2011 NLTK Project
+# Copyright (C) 2001-2012 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# $Id: senna.py $
"""
-A module for interfacing with the SENNA tagger.
+A module for interfacing with the SENNA pipeline.
"""
-import os
-import subprocess
-import tempfile
-import nltk
+from os import path, sep
+from subprocess import Popen, PIPE
from platform import architecture, system
-from nltk.tag.api import *
+from nltk.tag.api import TaggerI
_senna_url = 'http://ml.nec-labs.com/senna/'
+
+class Error(Exception):
+ """Basic error handling class to be extended by the module specific
+ exceptions"""
+
+
+class ExecutableNotFound(Error):
+ """Raised if the senna executable does not exist"""
+
+
+class RunFailure(Error):
+ """Raised if the pipeline fails to execute"""
+
+
+class SentenceMisalignment(Error):
+ """Raised if the new sentence is shorter than the original one or the number
+ of sentences in the result is less than the input."""
+
+
class SennaTagger(TaggerI):
- __OPS = ['pos', 'chk', 'ner']
+ """
+ A general interface to the SENNA pipeline that supports any of the
+ operations specified in SUPPORTED_OPERATIONS.
+
+ Applying multiple operations at once has a speed advantage. For example,
+ SENNA v2.0 calculates the POS tags anyway while extracting named entities,
+ so requesting both operations costs only the time of extracting the named
+ entities.
+
+ The SENNA pipeline has a fixed maximum sentence size that it can read,
+ 1024 tokens per sentence by default. If you have longer sentences, consider
+ changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding your
+ system-specific binary; otherwise misalignment errors can occur.
+
+ The input is:
+ - path to the directory that contains SENNA executables.
+ - a list of the operations to be performed.
+ - (optionally) the encoding of the input data (default: utf-8)
+
+ Example:
+
+ .. doctest::
+ :options: +SKIP
+
+ >>> from nltk.tag.senna import SennaTagger
+ >>> pipeline = SennaTagger('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
+ >>> sent = u'Düsseldorf is an international business center'.split()
+ >>> pipeline.tag(sent)
+ [{'word': u'D\xfcsseldorf', 'chk': u'B-NP', 'ner': u'B-PER', 'pos': u'NNP'},
+ {'word': u'is', 'chk': u'B-VP', 'ner': u'O', 'pos': u'VBZ'},
+ {'word': u'an', 'chk': u'B-NP', 'ner': u'O', 'pos': u'DT'},
+ {'word': u'international', 'chk': u'I-NP', 'ner': u'O', 'pos': u'JJ'},
+ {'word': u'business', 'chk': u'I-NP', 'ner': u'O', 'pos': u'NN'},
+ {'word': u'center', 'chk': u'I-NP', 'ner': u'O', 'pos': u'NN'}]
+ """
- def __init__(self, path, operations, encoding=None, verbose=False):
+ SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
+
+ def __init__(self, senna_path, operations, encoding='utf-8'):
self._encoding = encoding
- self._path = os.path.normpath(path) + os.sep
+ self._path = path.normpath(senna_path) + sep
self.operations = operations
@property
def executable(self):
+ """
+ A property that determines the system-specific binary that should be
+ used in the pipeline. If the system is not recognised, the generic senna
+ binary is used.
+ """
os_name = system()
if os_name == 'Linux':
bits = architecture()[0]
if bits == '64bit':
- return os.path.join(self._path, 'senna-linux64')
- return os.path.join(self._path, 'senna-linux32')
+ return path.join(self._path, 'senna-linux64')
+ return path.join(self._path, 'senna-linux32')
if os_name == 'Windows':
- return os.path.join(self._path, 'senna-win32.exe')
+ return path.join(self._path, 'senna-win32.exe')
if os_name == 'Darwin':
- return os.path.join(self._path, 'senna-osx')
- return os.path.join(self._path, 'senna')
+ return path.join(self._path, 'senna-osx')
+ return path.join(self._path, 'senna')
def _map(self):
- _map = {'word':0}
+ """
+ A method that calculates the column in which the SENNA pipeline outputs
+ each requested tag. The column order follows SUPPORTED_OPERATIONS, restricted
+ to the requested operations.
+ """
+ _map = {}
i = 1
- for operation in SennaTagger.__OPS:
+ for operation in SennaTagger.SUPPORTED_OPERATIONS:
if operation in self.operations:
_map[operation] = i
i+= 1
return _map
def tag(self, tokens):
+ """
+ Applies the specified operation(s) on a list of tokens.
+ """
return self.batch_tag([tokens])[0]
def batch_tag(self, sentences):
+ """
+ Applies the tag method over a list of sentences. This method returns, for
+ each sentence, a list of dictionaries. Every dictionary contains a word
+ with its calculated annotations/tags.
+ """
encoding = self._encoding
+ # Verifies the existence of the executable
+ if not path.isfile(self.executable):
+ raise ExecutableNotFound("Senna executable expected at %s but not found" %
+ self.executable)
+
# Build the senna command to run the tagger
_senna_cmd = [self.executable, '-path', self._path, '-usrtokens', '-iobtags']
_senna_cmd.extend(['-'+op for op in self.operations])
- # Write the actual sentences to the temporary input file
+ # Serialize the actual sentences to a temporary string
_input = '\n'.join((' '.join(x) for x in sentences))+'\n'
if isinstance(_input, unicode) and encoding:
_input = _input.encode(encoding)
# Run the tagger and get the output
- p = subprocess.Popen(_senna_cmd,
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
+ p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = p.communicate(input=_input)
senna_output = stdout
# Check the return code.
if p.returncode != 0:
- print stderr
- raise OSError('Senna command failed!')
+ raise RunFailure('Senna command failed! Details: %s' % stderr)
if encoding:
senna_output = stdout.decode(encoding)
# Output the tagged sentences
map_ = self._map()
tagged_sentences = [[]]
+ sentence_index = 0
+ token_index = 0
for tagged_word in senna_output.strip().split("\n"):
if not tagged_word:
tagged_sentences.append([])
+ sentence_index += 1
+ token_index = 0
continue
tags = tagged_word.split('\t')
result = {}
for tag in map_:
result[tag] = tags[map_[tag]].strip()
+ try:
+ result['word'] = sentences[sentence_index][token_index]
+ except IndexError:
+ raise SentenceMisalignment(
+ "Misalignment error occurred at sentence number %d. Possible reason"
+ " is that the sentence size exceeded the maximum size. Check the "
+ "documentation of SennaTagger class for more information."
+ % sentence_index)
tagged_sentences[-1].append(result)
+ token_index += 1
return tagged_sentences
class POSTagger(SennaTagger):
"""
- A class for pos tagging with Senna POSTagger. The input is the paths to:
- - A path to the senna executables
+ A Part of Speech tagger.
+
+ The input is:
+ - path to the directory that contains SENNA executables.
+ - (optionally) the encoding of the input data (default: utf-8)
Example:
- >>> tagger = senna.POSTagger(path='/media/data/NER/senna-v2.0')
- >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split())
+ .. doctest::
+ :options: +SKIP
+
+ >>> from nltk.tag.senna import POSTagger
+ >>> postagger = POSTagger('/usr/share/senna-v2.0')
+ >>> postagger.tag('What is the airspeed of an unladen swallow ?'.split())
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
- def __init__(self, path, encoding=None, verbose=False):
- super(POSTagger, self).__init__(path, ['pos'], encoding, verbose)
+ def __init__(self, path, encoding='utf-8'):
+ super(POSTagger, self).__init__(path, ['pos'], encoding)
def batch_tag(self, sentences):
+ """
+ Applies the tag method over a list of sentences. This method returns, for
+ each sentence, a list of (word, tag) tuples.
+ """
tagged_sents = super(POSTagger, self).batch_tag(sentences)
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
- tagged_sents[i][j] = (sentences[i][j], tagged_sents[i][j]['pos'])
+ annotations = tagged_sents[i][j]
+ tagged_sents[i][j] = (annotations['word'], annotations['pos'])
return tagged_sents
class NERTagger(SennaTagger):
- def __init__(self, path, encoding=None, verbose=False):
- super(NERTagger, self).__init__(path, ['ner'], encoding, verbose)
+ """
+ A named entity extractor.
+
+ The input is:
+ - path to the directory that contains SENNA executables.
+ - (optionally) the encoding of the input data (default: utf-8)
+
+ Example:
+
+ .. doctest::
+ :options: +SKIP
+
+ >>> from nltk.tag.senna import NERTagger
+ >>> nertagger = NERTagger('/usr/share/senna-v2.0')
+ >>> nertagger.tag('Shakespeare theatre was in London .'.split())
+ [('Shakespeare', u'B-PER'), ('theatre', u'O'), ('was', u'O'), ('in', u'O'),
+ ('London', u'B-LOC'), ('.', u'O')]
+ >>> nertagger.tag('UN headquarters are in NY , USA .'.split())
+ [('UN', u'B-ORG'), ('headquarters', u'O'), ('are', u'O'), ('in', u'O'),
+ ('NY', u'B-LOC'), (',', u'O'), ('USA', u'B-LOC'), ('.', u'O')]
+ """
+ def __init__(self, path, encoding='utf-8'):
+ super(NERTagger, self).__init__(path, ['ner'], encoding)
def batch_tag(self, sentences):
+ """
+ Applies the tag method over a list of sentences. This method returns, for
+ each sentence, a list of (word, tag) tuples.
+ """
tagged_sents = super(NERTagger, self).batch_tag(sentences)
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
- try:
- tagged_sents[i][j] = (sentences[i][j], tagged_sents[i][j]['ner'])
- except:
- import pdb
- pdb.set_trace()
+ annotations = tagged_sents[i][j]
+ tagged_sents[i][j] = (annotations['word'], annotations['ner'])
return tagged_sents
class CHKTagger(SennaTagger):
- def __init__(self, path, encoding=None, verbose=False):
- super(CHKTagger, self).__init__(path, ['chk'], encoding, verbose)
+ """
+ A chunker.
+
+ The input is:
+ - path to the directory that contains SENNA executables.
+ - (optionally) the encoding of the input data (default: utf-8)
+
+ Example:
+
+ .. doctest::
+ :options: +SKIP
+
+ >>> from nltk.tag.senna import CHKTagger
+ >>> chktagger = CHKTagger('/usr/share/senna-v2.0')
+ >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
+ [('What', u'B-NP'), ('is', u'B-VP'), ('the', u'B-NP'), ('airspeed', u'I-NP'),
+ ('of', u'B-PP'), ('an', u'B-NP'), ('unladen', u'I-NP'), ('swallow', u'I-NP'),
+ ('?', u'O')]
+ """
+ def __init__(self, path, encoding='utf-8'):
+ super(CHKTagger, self).__init__(path, ['chk'], encoding)
def batch_tag(self, sentences):
+ """
+ Applies the tag method over a list of sentences. This method returns, for
+ each sentence, a list of (word, tag) tuples.
+ """
tagged_sents = super(CHKTagger, self).batch_tag(sentences)
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
- tagged_sents[i][j] = (sentences[i][j], tagged_sents[i][j]['chk'])
+ annotations = tagged_sents[i][j]
+ tagged_sents[i][j] = (annotations['word'], annotations['chk'])
return tagged_sents
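
For reference, the column mapping computed by _map() follows the fixed order of SUPPORTED_OPERATIONS ('pos', 'chk', 'ner'), restricted to the operations actually requested, regardless of the order they were passed in. A minimal sketch, assuming only that this version of nltk.tag.senna is importable (the constructor does not invoke the SENNA binary, so nothing needs to be installed for this):

>>> from nltk.tag.senna import SennaTagger
>>> tagger = SennaTagger('/usr/share/senna-v2.0', ['ner', 'pos'])
>>> sorted(tagger._map().items())
[('ner', 2), ('pos', 1)]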

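A rough sketch of how a caller might handle the new exceptions introduced by this patch, assuming SENNA v2.0 is installed under /usr/share/senna-v2.0 as in the doctests above (the handler bodies are purely illustrative):

from nltk.tag.senna import (SennaTagger, ExecutableNotFound,
                            RunFailure, SentenceMisalignment)

pipeline = SennaTagger('/usr/share/senna-v2.0', ['pos', 'ner'])
sentences = [u'Shakespeare theatre was in London .'.split(),
             u'UN headquarters are in NY , USA .'.split()]

try:
    tagged = pipeline.batch_tag(sentences)
except ExecutableNotFound as e:
    # No system-specific senna binary was found at the given path.
    print('Install SENNA or fix the path: %s' % e)
except RunFailure as e:
    # The binary was found but exited with a non-zero return code.
    print('SENNA failed to run: %s' % e)
except SentenceMisalignment as e:
    # The output could not be aligned with the input, e.g. a sentence
    # exceeded SENNA's maximum sentence size.
    print('Alignment problem: %s' % e)
else:
    for sentence in tagged:
        print([(token['word'], token['pos'], token['ner']) for token in sentence])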