/
vet_skipwords.py
82 lines (61 loc) · 2.85 KB
/
vet_skipwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import csv
import logging
log = logging.getLogger("safecity.locate.vet_skip_words")
from optparse import make_option
import os
import re
from django.conf import settings
from django.core.management.base import NoArgsCommand, CommandError
from safecity.apps.locate.location_parser import strip_punctuation, WHITESPACE_REGEX
from safecity.apps.locate.models import *
from safecity.apps.tropo.views import KEYWORDS
class Command(NoArgsCommand):
    """Check the skip-words list for collisions with location vocabulary.

    A skip word is reported as a problem when it is also a road type, a road
    direction, one of the values in ``KEYWORDS``, or a whole word inside the
    name of a ``RoadAlias``. With ``--rewrite`` the skip-words file is written
    back without the problem words.
    """
    title = 'locate.vet_skip_words'
    help = 'Verify that no skip_words are road names or other keywords.'

    option_list = NoArgsCommand.option_list + (
        make_option('-r', '--rewrite', action='store_true', dest='rewrite',
            help='Rewrite skipwords list without problem words.'),
    )

    @staticmethod
    def _first_column(path):
        """Return the set of stripped first fields of the CSV file at *path*.

        Blank rows are ignored and rows may have any number of columns, so a
        stray empty line or an extra comma no longer aborts the command.
        """
        with open(path) as f:
            return set(row[0].strip() for row in csv.reader(f) if row)

    @staticmethod
    def _describe_problem(word, road_types, road_directions, keywords):
        """Return the error message for *word*, or None if it is acceptable.

        The checks run in a fixed order (road type, road direction, keyword,
        road alias) and only the first collision found is reported.
        """
        if word in road_types:
            return 'Word "%s" is also a road type.' % word
        if word in road_directions:
            return 'Word "%s" is also a road direction.' % word
        if word in keywords:
            return 'Word "%s" is also a keyword.' % word

        # The database lookup is a substring match; the regex then narrows it
        # to whole-word matches. The word is escaped so that regex
        # metacharacters in a skip word are matched literally.
        regex = re.compile(r'\b%s\b' % re.escape(word))
        for alias in RoadAlias.objects.filter(name__contains=word):
            if regex.search(alias.name):
                return 'Word "%s" is also a part of road alias "%s".' % (
                    word, alias.name)
        return None

    def handle_noargs(self, **options):
        """Vet every skip word and optionally rewrite the skip-words file.

        Each problem word is logged as an error. When the ``rewrite`` option
        is set, the skip-words file is overwritten with the words that passed
        every check, in their original order.
        """
        skip_words_path = os.path.join(settings.DATA_DIR, 'wordlists/skipwords')

        # Blank lines are dropped: an empty word would match every road alias
        # in the substring lookup below.
        with open(skip_words_path) as f:
            skip_words = [word for word in (line.strip() for line in f) if word]

        road_types = self._first_column(
            os.path.join(settings.DATA_DIR, 'streets/road_types.csv'))
        road_directions = self._first_column(
            os.path.join(settings.DATA_DIR, 'streets/road_directions.csv'))

        keywords = []
        for v in KEYWORDS.values():
            keywords.extend(v)

        # Collect the clean words in a separate list. Removing entries from
        # the list being iterated would make the loop skip the word that
        # follows every removed one.
        rewrite = []
        for word in skip_words:
            problem = self._describe_problem(
                word, road_types, road_directions, keywords)
            if problem is None:
                rewrite.append(word)
            else:
                log.error(problem)

        if options.get('rewrite'):
            log.info('Rewriting skip_words list without problem words')

            with open(skip_words_path, 'w') as f:
                f.write('\n'.join(rewrite))