
Merge pull request #226 from aboSamoor/senna_polish1
Adding documentation and error handling to the senna module.
xim committed Feb 19, 2012
2 parents 19dec81 + 667e3c4 commit 96f0c36
Showing 1 changed file with 184 additions and 41 deletions.
225 changes: 184 additions & 41 deletions nltk/tag/senna.py
@@ -1,149 +1,292 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Senna tagger
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# $Id: senna.py $

"""
A module for interfacing with the SENNA pipeline.
"""

from os import path, sep
from subprocess import Popen, PIPE
from platform import architecture, system
from nltk.tag.api import TaggerI

_senna_url = 'http://ml.nec-labs.com/senna/'

class Error(Exception):
    """Base class extended by the module-specific exceptions."""


class ExecutableNotFound(Error):
    """Raised if the senna executable does not exist."""


class RunFailure(Error):
    """Raised if the pipeline fails to execute."""


class SentenceMisalignment(Error):
    """Raised if a tagged sentence is shorter than the original one, or the
    result contains fewer sentences than the input."""

class SennaTagger(TaggerI):
    """
    A general interface to the SENNA pipeline that supports any of the
    operations specified in SUPPORTED_OPERATIONS.

    Applying multiple operations at once has a speed advantage. For example,
    senna v2.0 calculates the POS tags anyway while extracting the named
    entities, so applying both operations costs only the time of extracting
    the named entities.

    The SENNA pipeline has a fixed maximum sentence size that it can read,
    1024 tokens per sentence by default. If you have larger sentences,
    consider changing the MAX_SENTENCE_SIZE value in SENNA_main.c and
    rebuilding your system-specific binary; otherwise misalignment errors
    may be introduced.

    The input is:
    - path to the directory that contains the SENNA executables.
    - list of the operations to be performed.
    - (optionally) the encoding of the input data (default: utf-8).

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import SennaTagger
        >>> pipeline = SennaTagger('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
        >>> sent = u'Düsseldorf is an international business center'.split()
        >>> pipeline.tag(sent)
        [{'word': u'D\xfcsseldorf', 'chk': u'B-NP', 'ner': u'B-PER', 'pos': u'NNP'},
        {'word': u'is', 'chk': u'B-VP', 'ner': u'O', 'pos': u'VBZ'},
        {'word': u'an', 'chk': u'B-NP', 'ner': u'O', 'pos': u'DT'},
        {'word': u'international', 'chk': u'I-NP', 'ner': u'O', 'pos': u'JJ'},
        {'word': u'business', 'chk': u'I-NP', 'ner': u'O', 'pos': u'NN'},
        {'word': u'center', 'chk': u'I-NP', 'ner': u'O', 'pos': u'NN'}]
    """


    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']

    def __init__(self, senna_path, operations, encoding='utf-8'):
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep
        self.operations = operations

    @property
    def executable(self):
        """
        The system-specific binary to be used in the pipeline. If the system
        is not recognized, the generic 'senna' binary is used.
        """
        os_name = system()
        if os_name == 'Linux':
            bits = architecture()[0]
            if bits == '64bit':
                return path.join(self._path, 'senna-linux64')
            return path.join(self._path, 'senna-linux32')
        if os_name == 'Windows':
            return path.join(self._path, 'senna-win32.exe')
        if os_name == 'Darwin':
            return path.join(self._path, 'senna-osx')
        return path.join(self._path, 'senna')

    def _map(self):
        """
        Calculates into which column the SENNA pipeline writes the tags of
        each requested operation. The column order follows
        SUPPORTED_OPERATIONS.
        """
        _map = {}
        i = 1
        for operation in SennaTagger.SUPPORTED_OPERATIONS:
            if operation in self.operations:
                _map[operation] = i
                i += 1
        return _map


    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.
        """
        return self.batch_tag([tokens])[0]

    def batch_tag(self, sentences):
        """
        Applies the tag method over a list of sentences. For every sentence
        a list of dictionaries is returned, one per word, containing the word
        and its calculated annotations/tags.
        """
        encoding = self._encoding

        # Verify the existence of the executable
        if not path.isfile(self.executable):
            raise ExecutableNotFound("Senna executable expected at %s but not found" %
                                     self.executable)

        # Build the senna command to run the tagger
        _senna_cmd = [self.executable, '-path', self._path, '-usrtokens', '-iobtags']
        _senna_cmd.extend(['-'+op for op in self.operations])

        # Serialize the actual sentences to a temporary string
        _input = '\n'.join((' '.join(x) for x in sentences))+'\n'
        if isinstance(_input, unicode) and encoding:
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)
        senna_output = stdout

        # Check the return code.
        if p.returncode != 0:
            raise RunFailure('Senna command failed! Details: %s' % stderr)

        if encoding:
            senna_output = stdout.decode(encoding)

        # Output the tagged sentences
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            if not tagged_word:
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split('\t')
            result = {}
            for tag in map_:
                result[tag] = tags[map_[tag]].strip()
            try:
                result['word'] = sentences[sentence_index][token_index]
            except IndexError:
                raise SentenceMisalignment(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of SennaTagger class for more information."
                    % sentence_index)
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences
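As the class docstring notes, requesting several operations in one run is cheaper than running the pipeline separately (the POS tags come along while the named entities are extracted), and batch_tag returns one list of word dictionaries per input sentence. A short sketch under the same illustrative install path, with made-up sentences:

from nltk.tag.senna import SennaTagger

pipeline = SennaTagger('/usr/share/senna-v2.0', ['pos', 'ner'])
sentences = ['London is the capital of the United Kingdom .'.split(),
             'The UN headquarters are in NY .'.split()]
# One senna invocation annotates both sentences with both operations.
for sentence in pipeline.batch_tag(sentences):
    for token in sentence:
        print('%s\t%s\t%s' % (token['word'], token['pos'], token['ner']))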




class POSTagger(SennaTagger):
    """
    A Part of Speech tagger.

    The input is:
    - path to the directory that contains the SENNA executables.
    - (optionally) the encoding of the input data (default: utf-8).

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import POSTagger
        >>> postagger = POSTagger('/usr/share/senna-v2.0')
        >>> postagger.tag('What is the airspeed of an unladen swallow ?'.split())
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
        ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
    """
    def __init__(self, path, encoding='utf-8'):
        super(POSTagger, self).__init__(path, ['pos'], encoding)

    def batch_tag(self, sentences):
        """
        Applies the tag method over a list of sentences. For each sentence
        a list of (word, tag) tuples is returned.
        """
        tagged_sents = super(POSTagger, self).batch_tag(sentences)
        for i in range(len(tagged_sents)):
            for j in range(len(tagged_sents[i])):
                annotations = tagged_sents[i][j]
                tagged_sents[i][j] = (annotations['word'], annotations['pos'])
        return tagged_sents




class NERTagger(SennaTagger):
    """
    A named entity extractor.

    The input is:
    - path to the directory that contains the SENNA executables.
    - (optionally) the encoding of the input data (default: utf-8).

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import NERTagger
        >>> nertagger = NERTagger('/usr/share/senna-v2.0')
        >>> nertagger.tag('Shakespeare theatre was in London .'.split())
        [('Shakespeare', u'B-PER'), ('theatre', u'O'), ('was', u'O'), ('in', u'O'),
        ('London', u'B-LOC'), ('.', u'O')]
        >>> nertagger.tag('UN headquarters are in NY , USA .'.split())
        [('UN', u'B-ORG'), ('headquarters', u'O'), ('are', u'O'), ('in', u'O'),
        ('NY', u'B-LOC'), (',', u'O'), ('USA', u'B-LOC'), ('.', u'O')]
    """
    def __init__(self, path, encoding='utf-8'):
        super(NERTagger, self).__init__(path, ['ner'], encoding)

    def batch_tag(self, sentences):
        """
        Applies the tag method over a list of sentences. For each sentence
        a list of (word, tag) tuples is returned.
        """
        tagged_sents = super(NERTagger, self).batch_tag(sentences)
        for i in range(len(tagged_sents)):
            for j in range(len(tagged_sents[i])):
                annotations = tagged_sents[i][j]
                tagged_sents[i][j] = (annotations['word'], annotations['ner'])
        return tagged_sents




class CHKTagger(SennaTagger):
    """
    A chunker.

    The input is:
    - path to the directory that contains the SENNA executables.
    - (optionally) the encoding of the input data (default: utf-8).

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import CHKTagger
        >>> chktagger = CHKTagger('/usr/share/senna-v2.0')
        >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
        [('What', u'B-NP'), ('is', u'B-VP'), ('the', u'B-NP'), ('airspeed', u'I-NP'),
        ('of', u'B-PP'), ('an', u'B-NP'), ('unladen', u'I-NP'), ('swallow', u'I-NP'),
        ('?', u'O')]
    """
    def __init__(self, path, encoding='utf-8'):
        super(CHKTagger, self).__init__(path, ['chk'], encoding)

    def batch_tag(self, sentences):
        """
        Applies the tag method over a list of sentences. For each sentence
        a list of (word, tag) tuples is returned.
        """
        tagged_sents = super(CHKTagger, self).batch_tag(sentences)
        for i in range(len(tagged_sents)):
            for j in range(len(tagged_sents[i])):
                annotations = tagged_sents[i][j]
                tagged_sents[i][j] = (annotations['word'], annotations['chk'])
        return tagged_sents
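The SennaTagger docstring warns that SENNA reads at most MAX_SENTENCE_SIZE tokens per sentence (1024 by default) and that longer input can surface as SentenceMisalignment. A caller could screen its input first; a small sketch under that assumption (the constant mirrors the documented default and the helper name is made up, neither is exported by the module):

# Default MAX_SENTENCE_SIZE of SENNA v2.0 as described in the docstring;
# rebuild the binary from SENNA_main.c if longer sentences must be supported.
SENNA_MAX_TOKENS = 1024

def split_by_length(sentences, limit=SENNA_MAX_TOKENS):
    """Separate sentences senna can read from ones that exceed the limit."""
    fits, too_long = [], []
    for sentence in sentences:
        (fits if len(sentence) <= limit else too_long).append(sentence)
    return fits, too_long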
