This repository has been archived by the owner on Jan 31, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 46
/
utils.py
87 lines (58 loc) · 1.97 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
import urlparse
from fjord.feedback import config
# Separator pattern used by tokenize(): matches a run of one or more
# whitespace characters and/or any of the punctuation characters
# . , / \ ? ; : " * & ^ % $ # @ !
TOKEN_SPLIT_RE = re.compile(r'[\s\.\,\/\\\?\;\:\"\*\&\^\%\$\#\@\!]+')
def tokenize(text):
    """Splits text into lowercase tokens with stop words removed

    1. lowercases the text
    2. splits it on runs of whitespace and the punctuation characters
       matched by ``TOKEN_SPLIT_RE``
    3. drops tokens of one character or less and any token found in
       ``config.ANALYSIS_STOPWORDS``

    :arg text: text to tokenize

    :returns: list of the remaining tokens in order of appearance

    """
    kept = []
    for word in TOKEN_SPLIT_RE.split(text.lower()):
        # Splitting can leave empty strings when the text starts or
        # ends with a separator; the length check drops those along
        # with one-letter tokens.
        if len(word) <= 1:
            continue

        if word in config.ANALYSIS_STOPWORDS:
            continue

        kept.append(word)

    return kept
def compute_grams(text):
    """Computes bigrams from analyzed text

    The text is run through ``tokenize`` and every pair of consecutive
    tokens becomes one bigram. The two words of a bigram are sorted so
    that "youtube crash" and "crash youtube" produce the same bigram,
    and duplicate bigrams are collapsed.

    :arg text: text to analyze and generate bigrams from

    :returns: list of unique bigrams in no particular order; an empty
        list if ``text`` is falsy or yields fewer than two tokens

    Assuming "the" is one of the configured stop words:

    >>> sorted(compute_grams(u'The quick brown fox jumped'))
    [u'brown fox', u'brown quick', u'fox jumped']

    """
    if not text:
        return []

    tokens = tokenize(text)

    # Pair each token with its successor. With fewer than two tokens
    # zip() yields nothing, so no explicit length check is needed.
    # Collecting into a set removes duplicate bigrams.
    bigrams = set(
        u' '.join(sorted(pair))
        for pair in zip(tokens, tokens[1:])
    )

    return list(bigrams)
def clean_url(url):
    """Takes a user-supplied url and cleans bits out

    This:

    1. nixes any non http/https/chrome/about urls
    2. drops port numbers and any ``user:password@`` prefix
    3. drops params and query string variables
    4. drops hashes
    5. lowercases the hostname

    ``about:`` urls are returned untouched.

    :arg url: the url to clean

    :returns: the cleaned url; ``u''`` if the scheme is not one of
        http/https/chrome; ``url`` itself if it is falsy or an
        ``about:`` url

    """
    if not url:
        return url

    # Don't mess with about: urls.
    if url.startswith('about:'):
        return url

    parsed = urlparse.urlparse(url)

    if parsed.scheme not in ('http', 'https', 'chrome'):
        return u''

    # .hostname is the network location without userinfo or port,
    # lowercased. It is None when the url has no host.
    host = parsed.hostname or ''

    # .hostname also strips the square brackets from IPv6 literals.
    # Put them back, otherwise the rebuilt url ("http://::1/") is
    # malformed.
    if ':' in host:
        host = '[' + host + ']'

    # Rebuild url to drop querystrings, hashes, etc
    new_url = (parsed.scheme, host, parsed.path, '', '', '')
    return urlparse.urlunparse(new_url)