Permalink
Browse files

Improve command 'tokenization'

  • Loading branch information...
1 parent c6a898a; commit 0193a2c193931417be71bd059c9139cf2d34991b; nooodl committed Aug 1, 2013
Showing with 6 additions and 6 deletions.
  1. +6 −6 hist.py
View
12 hist.py
@@ -2,14 +2,14 @@
from itertools import takewhile
import sys, re
-def token(s):
- if re.search(r'[A-Z0-9\'\"\?]', s): return False
- return True
-
t = []
for s in sys.stdin:
- if s[0] == '*': print s.strip()
- t.append(' '.join(list(takewhile(token, s.split()))))
+ s = re.sub(r'''['"].*''', '', s)
+ s = re.sub(r' (for|to|is|at|from|then).*', '', s)
+ s = re.sub(r' [0-9].*', '', s)
+ s = re.sub(r' c[0-9a-f\?]{2}.*', '', s)
+ s = re.sub(r'\s+$', '', s)
+ t.append(s)
for s, c in Counter(t).most_common():
print '%d\t"%s"' % (c, s)

0 comments on commit 0193a2c

Please sign in to comment.