Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Improve command 'tokenization'
  • Loading branch information
nooodl committed Aug 1, 2013
1 parent c6a898a commit 0193a2c
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions hist.py
Expand Up @@ -2,14 +2,14 @@
from itertools import takewhile
import sys, re

def token(s):
    """Return True when the word *s* contains none of the stop characters.

    A word is rejected (False) if it contains an uppercase ASCII letter,
    a digit, a single quote, a double quote, or a question mark anywhere
    in it.  Every other word, including the empty string, is accepted.

    Used as the ``takewhile`` predicate over the whitespace-split words of
    an input line, so the first rejected word ends the kept prefix.
    """
    # re.search returns None when no stop character occurs in the word.
    return re.search(r'[A-Z0-9\'\"\?]', s) is None

# Read lines from stdin, reduce each to a shortened "command" string, and
# print how often each distinct string occurs, most frequent first.
# NOTE(review): indentation and diff markers are missing in this view. The
# commit header reports 6 additions and 6 deletions, so the takewhile-based
# append below is presumably the pre-commit form that the re.sub chain and
# the second append replace -- confirm against the raw diff before running.
t = []
for s in sys.stdin:
# Lines whose first character is '*' are echoed with surrounding whitespace
# stripped.
if s[0] == '*': print s.strip()
# Keep the leading run of whitespace-separated words accepted by token(),
# re-joined with single spaces.
t.append(' '.join(list(takewhile(token, s.split()))))
# Drop everything from the first single or double quote onward ('.' does
# not match the newline, so the line ending survives until the final strip).
s = re.sub(r'''['"].*''', '', s)
# Cut at the first space followed by one of the listed keywords.
# NOTE(review): there is no word boundary after the alternation, so a word
# that merely starts with one of them (e.g. " total", " format") also
# triggers the cut -- confirm this is intended.
s = re.sub(r' (for|to|is|at|from|then).*', '', s)
# Cut at the first space that is followed by a digit.
s = re.sub(r' [0-9].*', '', s)
# Cut at the first space followed by 'c' and two characters drawn from
# 0-9, a-f or '?'.
# NOTE(review): ordinary words such as " cab" match too; verify against
# real input.
s = re.sub(r' c[0-9a-f\?]{2}.*', '', s)
# Remove trailing whitespace, including the newline left by the steps above.
s = re.sub(r'\s+$', '', s)
t.append(s)

# Print "<count><TAB>"<string>"" for each distinct entry, in the order
# Counter.most_common() yields them (highest count first).
# NOTE(review): Counter is not imported in the lines visible here;
# presumably it comes from collections above this hunk.
for s, c in Counter(t).most_common():
print '%d\t"%s"' % (c, s)

0 comments on commit 0193a2c

Please sign in to comment.