Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 321 lines (292 sloc) 11.476 kB
99a5c0c @slinkp Adding copyrights and GPL v3 everywhere
slinkp authored
1 # Copyright 2007,2008,2009,2011 Everyblock LLC, OpenPlans, and contributors
2 #
3 # This file is part of ebdata
4 #
5 # ebdata is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 # ebdata is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with ebdata. If not, see <http://www.gnu.org/licenses/>.
17 #
18
5c9826f initial import
Don Kukral authored
19 import re
20
21 # Regex notes:
22 # * This is *not* a case-insensitive regex, because we assume
23 # capitalized words are special (street names).
24 # * All data matched by capturing parentheses is concatenated together, so
25 # if you don't want to include something in the resulting string, don't
26 # capture it.
27
28 # STREET_NAME is a fragment of a regular expression that is used in several
29 # places in our "real" regular expression (ADDRESSES_RE) below. The one tricky
30 # thing about it is that it includes a "CAPTURE_START" placeholder instead of
31 # a capturing opening parenthesis. This lets us create two versions of the
32 # regex -- STREET_NAME_CAPTURE and STREET_NAME_NOCAPTURE.
e7cee48 @slinkp comments on an intractable test failure, refs #17
slinkp authored
33
34
5c9826f initial import
Don Kukral authored
35 STREET_NAME = r"""
36 # Here, we define some common false positives and tell the regex to ignore them.
37 (?!
38 [Aa][Ss][Ss][Oo][Cc][Ii][Aa][Tt][Ee][Dd]\ [Pp][Rr][Ee][Ss][Ss] # associated press
39 |
40 [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\ [Oo][Ff] # university of
41 )
42 # DIRECTION
43 %(CAPTURE_START)s
44 (?:
45 [NSEWnsew]\.?
46 |
47 (?:
48 [Nn][Oo][Rr][Tt][Hh] |
49 [Ss][Oo][Uu][Tt][Hh] |
50 [Ee][Aa][Ss][Tt] |
51 [Ww][Ee][Ss][Tt] |
52 [Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt] |
f1457ee @slinkp fix an address-parsing bug found by code inspection ... but the tests…
slinkp authored
53 [Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt] |
5c9826f initial import
Don Kukral authored
54 [Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt] |
55 [Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]
56 )
57 |
58 (?:
59 N\.?W | S\.?W | N\.?E | S\.?E
60 )\.?
61 )
62 \ + # space (but not newline)
63 )?
64 (?:
65 # STREET NAME
66 %(CAPTURE_START)s
67 # Numbered street names with a suffix ("3rd", "4th").
68 \d+(?:st|ST|nd|ND|rd|RD|th|TH|d|D)
69
70 |
71
72 # Or, numbered street names without a suffix ("3", "4")
e7cee48 @slinkp comments on an intractable test failure, refs #17
slinkp authored
73 # but with a street type. (Suffix is captured later, so
74 # we use a lookahead here.)
5c9826f initial import
Don Kukral authored
75 \d+
76 (?=
77 \ +
78 (?:Ave|Avenue|Blvd|Boulevard|Bvd|Cir|Circle|Court|Ct|Dr|Drive|
79 Lane|Ln|Parkway|Pkwy|Place|Plaza|Pl|Plz|Point|Pt|Pts|Rd|Rte|
80 Sq|Sqs|Street|Streets|St|Sts|Terrace|Ter|Terr|Trl|Way|Wy
81 )
82 \b
83 )
84
85 |
86
87 # Or, street names that don't start with numbers.
88 (?:
89 # Optional prefixes --
90 # "St", as in "St Louis"
91 # "Dr. Martin", as in "Dr. Martin Luther King"
92 (?:
93 [Ss][Tt]\.?
94 |
95 [Dd][Rr]\.?\ [Mm][Aa][Rr][Tt][Ii][Nn]
96 )
97 \ +
98 )?
99 (?:
100 Mass\.(?=\ +[Aa]ve) # Special case: "Mass." abbr. for "Massachusetts Ave."
101 # Needs to be special-cased because of the period.
102 |
103 (?:Avenue|Ave\.?)\ +[A-Z] # Special case: "Avenue X"
104 |
105 [A-Z][a-z][A-Za-z]* # One initial-capped word
106 |
107 [A-Z]\b # Single-letter street name (e.g., K St. in DC)
108 (?!\.\w) # Avoid '20 U.S.A.'
109 )
110 )
111 (?:
112 # Here, we list the options with street suffixes first, so that
113 # the suffix abbreviations are treated as the last part of the
114 # street name, to avoid overeagerly capturing "123 Main St. The".
115 %(CAPTURE_START)s
116 \ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
117 |
118 \ +[A-Z][a-z][A-Za-z]*\ (?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
119 |
120 (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){2}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
121 |
122 (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){3}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
123 |
124 (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){4}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
125 |
126 (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){5}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
127 |
128 (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){1,5}
129 )?
130 # OPTIONAL POST-DIR
131 (?:
132 # Standard post-dir format
133 %(CAPTURE_START)s
134 ,?\s(?:N\.?E|S\.?E|N\.?W|S\.?W|N|S|E|W)\.?
135 )
136 # Avoid greedily capturing more letters, like
137 # '123 Main St, New England' to '123 Main St, N'
138 (?![A-Za-z])
139
140 |
141
142 # Or, a special-case for DC quadrants, to find stuff like:
143 # "600 H Street in NE Washington"
144 # "600 H Street in the NE quadrant"
145 # "600 H Street in northeast DC"
146
147 # Note that this is NOT captured, so that it's excluded from
148 # the final output.
149 ,?
150 \s in
151 %(CAPTURE_START)s
152 \s
153 )
154 (?:
155 (?:the|far) \s
156 )?
157
158 %(CAPTURE_START)s
159 (?:NE|SE|NW|SW|[Nn]ortheast|[Ss]outheast|[Nn]orthwest|[Ss]outhwest)
160 (?=
161 \s (?:quadrant|D\.?C\.?|Washington)
162 )
163 )
164 )?
165 )?
166 )
167 """
# Instantiate the STREET_NAME template twice: once with every CAPTURE_START
# placeholder turned into a capturing "(" and once with a non-capturing "(?:".
STREET_NAME_CAPTURE = STREET_NAME % {'CAPTURE_START': '('}
STREET_NAME_NOCAPTURE = STREET_NAME % {'CAPTURE_START': '(?:'}
170
e7cee48 @slinkp comments on an intractable test failure, refs #17
slinkp authored
171 ADDRESSES_RE = r"""(?x)
5c9826f initial import
Don Kukral authored
172 (?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection.
173 \b
174
175 # Ignore things that look like dates -- e.g., "21 May 2009".
176 # This is a problem e.g. in cases where there's a May Street.
177 (?!
178 \d+\s+
179 (?:January|February|March|April|May|June|July|August|September|October|November|December)
180 ,?\s+
181 \d\d\d\d
182 )
183
184 # Ignore intersections that are prefixed by "University of", like
185 # "University of Texas at Austin". This is a common false positive.
186 (?<!
187 [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s
188 )
189
190 (?:
191 # SEGMENT ("FOO BETWEEN BAR AND BAZ")
192 (?:
193 %(STREET_NAME_CAPTURE)s (,?\ + between \ +) %(STREET_NAME_CAPTURE)s (,?\ + and \ +) %(STREET_NAME_CAPTURE)s
194 |
195 %(STREET_NAME_CAPTURE)s (,?\ + from \ +) %(STREET_NAME_CAPTURE)s (,?\ + to \ +) %(STREET_NAME_CAPTURE)s
196 )
197
198 |
199
200 # BLOCK/ADDRESS
201 (?:
202 (
203 (?:
204 (?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ]
205 (?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )?
206 [Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff]
207 |
208 \d+\ *-\ *\d+
209 |
210 \d+
211 )
212 \ +
213 )
214 %(STREET_NAME_CAPTURE)s
215
216 # ignore the intersection in parentheses so that it's not picked
217 # up as a separate location. We do this by consuming the string
218 # but *not* capturing it.
219 (?:
220 \ +
221 \(?
222 between
223 \ +
224 %(STREET_NAME_NOCAPTURE)s
225 \ +
226 and
227 \ +
228 %(STREET_NAME_NOCAPTURE)s
229 \)?
230 )?
231 )
232
233 |
234
235 # INTERSECTION
236 (?:
237 # Common intersection prefixes. They're included here so that the
238 # regex doesn't include them as part of the street name.
239 (?:
240 (?:
241 [Nn]ear |
242 [Aa]t |
243 [Oo]n |
244 [Tt]o |
245 [Aa]round |
246 [Ii]ntersection\ of |
247 [Cc]orner\ of |
248 [Aa]rea\ of |
249 [Aa]reas?\ surrounding |
250 vicinity\ of |
251 ran\ down |
252 running\ down |
253 crossed
254 )
255 \ +
256 )?
257 \b
258 (?:%(STREET_NAME_CAPTURE)s)
259 (\ +)
260 (
261 (?:
262 [Aa][Nn][Dd] |
263 [Aa][Tt] |
264 [Nn][Ee][Aa][Rr] |
265 & |
266 [Aa][Rr][Oo][Uu][Nn][Dd] |
267 [Tt][Oo][Ww][Aa][Rr][Dd][Ss]? |
268 [Oo][Ff][Ff] |
269 (?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff] |
270 (?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt]
271 )
272 \ +
273 )
274 (?:%(STREET_NAME_CAPTURE)s)
275 )
276 )
277
278 # OPTIONAL CITY SUFFIX
279 (?:
280 (?:
281 ,?\s+in |
282 ,
283 )
284 \s+
285
286 # CITY NAME
287 (
288 [A-Z][a-z][A-Za-z]* # One initial-capped word
289 (?:
290 ,?\ Jr\.?,?
291 |
292 \ [A-Z][a-z][A-Za-z]*
293 |
294 -[A-Za-z]+ # Hyphenated words (e.g. "Croton-on-Hudson" in NY)
295 ){0,4} # Initial-capped words
296 )
297 )?
e7cee48 @slinkp comments on an intractable test failure, refs #17
slinkp authored
298 """ % {'STREET_NAME_CAPTURE': STREET_NAME_CAPTURE, 'STREET_NAME_NOCAPTURE': STREET_NAME_NOCAPTURE}
299
# Compiled once at import time; parse_addresses() and tag_addresses() both
# run their searches through this single compiled pattern.
ADDRESSES_RE_COMPILED = re.compile(ADDRESSES_RE)
5c9826f initial import
Don Kukral authored
301
def parse_addresses(text):
    """
    Finds every address in the given string and returns them as a list of
    (address, city) tuples, in the order they appear in the text.

    The city is an empty string when no city follows the address.
    """
    found = []
    for groups in ADDRESSES_RE_COMPILED.findall(text):
        # The city is the final capturing group in ADDRESSES_RE; every group
        # before it is a piece of the address. findall() yields '' for groups
        # that did not take part in the match, so a plain join is safe.
        address = ''.join(groups[:-1])
        city = groups[-1]
        found.append((address, city))
    return found
5c9826f initial import
Don Kukral authored
309
def tag_addresses(text, pre='<addr>', post='</addr>'):
    """
    "Tags" every address found in the given string by wrapping it in pre and
    post. Returns the resulting string.

    When an address is followed by a city, the city ends up inside the tags
    as well, rewritten as "<address>, <city>". Whatever joined the two in the
    original text (a comma or "in") is consumed by the match and not kept.

    NOTE(review): this docstring used to say that cities are not tagged,
    which contradicts what the code does. The behavior is left unchanged
    here; confirm which of the two was intended.
    """
    def _re_handle_address(m):
        groups = m.groups()
        # Groups that took no part in the match come back as None, so filter
        # them out before joining the address pieces together.
        tagged = ''.join(filter(None, groups[:-1]))
        # The last capturing group in ADDRESSES_RE is the (optional) city.
        city = groups[-1]
        if city:
            tagged += ', %s' % city
        return pre + tagged + post
    return ADDRESSES_RE_COMPILED.sub(_re_handle_address, text)
Something went wrong with that request. Please try again.