This repository has been archived by the owner on Mar 10, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
dummy.py
63 lines (57 loc) · 2.35 KB
/
dummy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
def uniqify(seq, idfun=None):  # Alex Martelli ******* order preserving
    """Return the items of *seq* with duplicates removed, keeping order.

    Args:
        seq: Iterable of items to de-duplicate.
        idfun: Optional callable mapping an item to the hashable marker
            that decides whether two items count as duplicates. When
            omitted, the item itself is the marker.

    Returns:
        A new list holding the first item seen for each distinct marker,
        in the order those items appeared in *seq*.
    """
    if idfun is None:
        def idfun(x):
            return x
    seen = set()
    result = []
    for item in seq:
        marker = idfun(item)
        # ``in`` replaces dict.has_key(), which does not exist on Python 3.
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result
def iuniqify(seq):
    """Return a list of the items of *seq*, unique case-insensitively.

    Strings are compared by their lower-cased form; any other item is
    compared as-is. The first occurrence is the one kept, so the input
    ['foo', 'bar', 'Foo'] returns ['foo', 'bar'].
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    def idfunction(x):
        # Only strings have a case to fold; leave everything else alone.
        if isinstance(x, string_types):
            return x.lower()
        return x

    return uniqify(seq, idfunction)
class H:
    """Container for the stop-word stripping helper for search terms."""

    # Common English words, taken from
    # http://www.textfixer.com/resources/common-english-words.txt
    # (a list the original author found via Wikipedia).
    # Built once as a frozenset so each membership test is a hash lookup
    # instead of a scan of a list that was re-split on every call.
    _STOPWORDS = frozenset((
        "a able about across after all almost also am among an and "
        "any are as at be because been but by can cannot could dear "
        "did do does either else ever every for from get got had has "
        "have he her hers him his how however i if in into is it its "
        "just least let like likely may me might most must my "
        "neither no nor not of off often on only or other our own "
        "rather said say says she should since so some than that the "
        "their them then there these they this tis to too twas us "
        "wants was we were what when where which while who whom why "
        "will with would yet you your"
    ).split())

    # Matches every character that is not a word character; used to drop
    # punctuation before the stop-word lookup. Raw string so that ``\w``
    # is not treated as a (invalid) string escape.
    _NON_WORD = re.compile(r'[^\w]')

    def _stripStopwords(self, qterm):
        """Return the improved search term and what words were removed.

        Args:
            qterm: Search string; it is split on whitespace into words.

        Returns:
            A ``(term, stripped)`` tuple. ``term`` is the kept words joined
            by single spaces, with case-insensitive duplicates removed.
            ``stripped`` is the list of removed words exactly as they
            appeared in ``qterm`` (repeats included).
        """
        stripped = []
        kept = []
        for term in qterm.split():
            # A stop word written as '+word' is kept, minus the '+'.
            # This has to be tested first: the punctuation-insensitive
            # check below also matches '+word', which previously made
            # this branch unreachable.
            if term.startswith('+') and term[1:].lower() in self._STOPWORDS:
                kept.append(term[1:])
            elif self._NON_WORD.sub('', term.lower()) in self._STOPWORDS:
                stripped.append(term)
            else:
                kept.append(term)
        return ' '.join(iuniqify(kept)), stripped
# Demo: run the stop-word stripper on a few sample queries and print each
# returned (remaining term, stripped words) tuple.
h = H()
# Single-argument print(...) prints the same text as the Python 2 print
# statement and is also valid Python 3.
print(h._stripStopwords('All I do think about you, think about'))
print(h._stripStopwords('ALL I DO THINK ABOUT YOU, THINK ABOUT'))
print(h._stripStopwords('i and cant stop thinking about you'))