Skip to content

Commit

Permalink
Import to git.
Browse files Browse the repository at this point in the history
  • Loading branch information
deni64k committed Aug 20, 2014
1 parent 363023d commit 64e7050
Show file tree
Hide file tree
Showing 17 changed files with 394 additions and 564 deletions.
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ pip-log.txt
*.mo

## paver generated files
/paver-minilib.zip# Created by http://www.gitignore.io
/paver-minilib.zip

# Created by http://www.gitignore.io

### Python ###
# Byte-compiled / optimized / DLL files
Expand Down Expand Up @@ -99,7 +101,8 @@ target/
.LSOverride

# Icon must end with two \r
Icon
Icon


# Thumbnails
._*
Expand Down
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ include requirements-dev.txt
include requirements.txt
include setup.py
include tox.ini

include mystem/*.c
include mystem/*.h
119 changes: 119 additions & 0 deletions bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python
# encoding: utf-8

from __future__ import unicode_literals

import codecs
import timeit
import sys

import mystem
import pymorphy2
import pymorphy2.tokenizers


# Sample paragraph used as the default benchmark input. It is a unicode
# string because of the ``unicode_literals`` import at the top of the file.
para = """\
Внимательно, не мигая, сквозь редкие облака,
на лежащего в яслях ребенка издалека,
из глубины Вселенной, с другого ее конца,
звезда смотрела в пещеру. И это был взгляд Отца."""
# The same text as UTF-8 encoded bytes, and an alias for the unencoded text.
para_utf8 = para.encode('utf-8')
para_unicode = para

# Shared pymorphy2 analyzer, constructed once at import time.
_morph = pymorphy2.MorphAnalyzer()

# Shared Mystem wrapper. start_process() is called up front, presumably so the
# start-up cost stays out of the timed calls -- TODO confirm.
_mystem = mystem.Mystem(grammar_info=False, entire_input=True)
_mystem.start_process()


def with_bench(func_name, profile=True):
    """Time 10000 calls of a callable from ``__main__`` and print the result.

    ``func_name`` is the name of a callable importable from ``__main__``.
    ``profile`` only guards the profiler hooks below, which are all disabled,
    so it currently has no effect on the measurement.
    """
    print('Benchmark %s' % func_name)

    if profile:
        # Disabled profiler start hooks, kept for reference.
        #
        # gperftools via ctypes:
        # import ctypes
        # profiler = ctypes.CDLL("libprofiler.dylib")
        # profiler.ProfilerStart()
        #
        # yep:
        # import yep
        # yep.start('%s.prof' % func_name)
        #
        # cProfile:
        # from cStringIO import StringIO
        # import cProfile
        # import pstats
        # import sys
        # pr = cProfile.Profile()
        # pr.enable()
        pass

    statement = """%s.__call__()""" % func_name
    setup = """from __main__ import %s""" % func_name
    elapsed = timeit.timeit(statement, setup, number=10000)

    if profile:
        # Disabled profiler stop hooks, matching the start hooks above.
        #
        # profiler.ProfilerStop()
        #
        # yep.stop()
        #
        # pr.disable()
        # s = StringIO()
        # sortby = 'cumulative'
        # ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        # ps.print_stats()
        # sys.stderr.write(s.getvalue())
        pass

    print(elapsed)


def war_and_peace():
    """Lemmatize ``book1.txt`` and ``book2.txt`` and echo the string tokens.

    Both files are opened by relative path and read as windows-1251 text.
    Each line is passed to ``using_mystem``; every string item of the result
    is written to standard output, and non-string items are skipped.
    """
    for fname in ['book1.txt', 'book2.txt']:
        with codecs.open(fname, 'r', 'windows-1251') as f:
            # Iterate the codecs wrapper itself so each line arrives already
            # decoded. The deprecated ``xreadlines()`` is not part of the
            # wrapper's API: it was resolved on the underlying byte stream,
            # bypassed the decoder and forced a second, manual ``decode`` of
            # every line.
            for line in f:
                for t in using_mystem(line):
                    if isinstance(t, basestring):
                        sys.stdout.write(t)


def from_stdin():
    """Lemmatize every line of standard input with both backends.

    For each stripped input line, prints the mystem tokens joined without a
    separator, then the pymorphy tokens joined with single spaces.
    """
    for line in sys.stdin:
        line = line.strip()
        # mystem receives the line exactly as read from stdin.
        ts = using_mystem(line)
        # print json.dumps(ts, ensure_ascii=False)
        print 'mystem tokens=\t', ''.join(ts)
        # pymorphy receives the line decoded from UTF-8; this assumes stdin
        # yields byte strings (Python 2) -- TODO confirm.
        ts = using_pymorphy(line.decode('utf-8'))
        print 'pymorphy tokens=\t', ' '.join(ts)


def using_mystem(text=para_unicode):
    """Return ``_mystem.lemmatize(text)``; *text* defaults to the sample paragraph."""
    # print 'text=', text
    lemmatized = _mystem.lemmatize(text)
    return lemmatized


def using_pymorphy(text=para_unicode):
    """Tokenize *text* with pymorphy2 and return one lemma per token.

    A token's lemma is the ``normal_form`` of its first parse; a token with
    no parses is returned unchanged. *text* defaults to the sample paragraph.
    """
    def _lemma_of(token):
        parses = _morph.parse(token)
        return parses[0].normal_form if len(parses) >= 1 else token

    words = pymorphy2.tokenizers.simple_word_tokenize(text)
    return [_lemma_of(word) for word in words]


def main():
    """Benchmark both lemmatizers and print their output for the sample text.

    For each backend, runs ``with_bench`` on it and then prints its lemmas
    joined with that backend's separator (a space for pymorphy, nothing for
    mystem).
    """
    # Alternative entry points, currently disabled:
    # print("Stdin")
    # from_stdin()
    # print('Война и Мир')
    # war_and_peace()
    backends = (
        ('using_pymorphy', using_pymorphy, ' '),
        ('using_mystem', using_mystem, ''),
    )
    for bench_name, lemmatize, separator in backends:
        with_bench(bench_name, profile=True)
        print(separator.join(lemmatize()))


if __name__ == '__main__':
    main()
79 changes: 9 additions & 70 deletions mystem/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
# -*- coding: utf-8 -*-
"""Wraps the Mystem"""

# import fcntl
# import os
# import select
# import subprocess
# import re
# import ujson as json
from __future__ import print_function

from mystem import metadata
import sys

from . import metadata


__version__ = metadata.version
Expand All @@ -17,68 +13,11 @@
__copyright__ = metadata.copyright


# def _set_non_blocking(fd):
# """
# Set the file description of the given file descriptor to non-blocking.
# """
# flags = fcntl.fcntl(fd, fcntl.F_GETFL)
# flags = flags | os.O_NONBLOCK
# fcntl.fcntl(fd, fcntl.F_SETFL, flags)


# class Mystem(object):
# def __init__(self, mystembin=None):
# self._mystembin = mystem
# self._mystemargs = ["-gidc", "--format", "json"]
# self._procin = None
# self._procout = None
# self._procout_no = None
# self._proc = None

# if self._mystembin is None:
# self._mystembin = "/Users/negval/Downloads/mystem"

# def _start_mystem(self):
# self._proc = subprocess.Popen([self._mystembin] + self._mystemargs,
# stdin=subprocess.PIPE,
# stdout=subprocess.PIPE,
# bufsize=0,
# close_fds=True)

# self._procin, self._procout = self._proc.stdin, self._proc.stdout
# self._procout_no = self._procout.fileno()
# _set_non_blocking(self._procout)

# def lemmatize(self, token):
# if self._proc is None:
# self._start_mystem()

# self._procin.write(token)
# self._procin.write('\n')
# self._procin.flush()

# out = None
# select.select([self._procout_no], [], [])
# while True:
# try:
# out = self._procout.readline()
# break
# except IOError:
# rd, _, _ = select.select([self._procout_no], [], [])
# if self._procout_no not in rd:
# raise
# obj = json.loads(out)
from ._mystem import (Mystem, autoinstall) # noqa

# return obj

# def _get_lemma(self, o):
# try:
# return o['analysis'][0]['lex'].strip()
# except (KeyError, IndexError):
# return o['text'].strip()
def main():
    """Call ``autoinstall`` with standard output as its argument."""
    out = sys.stdout
    autoinstall(out)

# def tokenize(self, text):
# text = re.sub(r"(\n|\r)", " ", text)
# lemmas = self.lemmatize(text)
# res = ' '.join(filter(None, map(self._get_lemma, lemmas)))
# return res
# Run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
Loading

0 comments on commit 64e7050

Please sign in to comment.