forked from ramusus/django-geonames
-
Notifications
You must be signed in to change notification settings - Fork 1
/
compress_geonames.py
134 lines (107 loc) · 5.83 KB
/
compress_geonames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import datetime, gzip, os, sys, zipfile
from optparse import make_option
from django.core.management.base import NoArgsCommand
from geonames import models
# Absolute path of the ``data`` directory that sits next to the geonames
# ``models`` module; the zip dumps are read from here and the gzip output is
# written here.
GEONAMES_DATA = os.path.abspath(os.path.join(os.path.dirname(models.__file__), 'data'))
# ``pc`` sub-directory of the data directory.
# NOTE(review): presumably holds postal-code dumps -- nothing in this module
# reads it, confirm against the other commands.
GEONAMES_DATA_PC = os.path.join(GEONAMES_DATA, 'pc')
class Command(NoArgsCommand):
    """Repack the GeoNames zip dumps as tab-separated gzip files.

    ``allCountries.zip`` and ``alternateNames.zip`` are read from
    ``GEONAMES_DATA`` and rewritten as ``allCountries.gz`` and
    ``alternateNames.gz`` in the same directory.

    The dump contents are handled as bytes from the archive to the gzip file
    (``ZipFile.read`` returns bytes and ``GzipFile.write`` expects bytes), so
    the field lengths reported by ``--lengths`` are byte counts.
    """

    option_list = NoArgsCommand.option_list + (
        make_option('-t', '--time', action='store_true', dest='time', default=False,
                    help='Print the total time in running this command'),
        make_option('-l', '--lengths', action='store_true', dest='lengths', default=False,
                    help='Print the lengths for each of the fields.'),
        make_option('--no-countries', action='store_true', dest='no_countries', default=False,
                    help='Do not perform compression on allCountries.zip'),
        make_option('--no-alternates', action='store_true', dest='no_alternates', default=False,
                    help='Do not perform compression on alternateNames.zip'),
    )

    # ANSI escape sequences: erase the current line, then move the cursor back
    # to the first column, so each progress update overwrites the previous one.
    clear_line = chr(27) + '[2K' + chr(27) + '[G'

    def _compress(self, basename, in_fields, len_fields, build_values, show_lengths):
        """Convert ``<basename>.txt`` inside ``<basename>.zip`` to ``<basename>.gz``.

        ``in_fields`` names the leading tab-separated columns of every input
        line.  ``build_values`` receives the row as a dict (field name ->
        stripped bytes value) and returns the list of bytes values to write
        for that row.  When ``show_lengths`` is true, the maximum length seen
        for each field in ``len_fields`` is printed at the end.

        Lines with fewer columns than ``in_fields`` are reported on stderr
        and skipped instead of aborting the whole run.
        """
        member = basename + '.txt'
        lengths = dict((field, 0) for field in len_fields)

        # Read the archive first so that a missing/corrupt dump does not
        # leave a freshly truncated .gz file behind.
        zf = zipfile.ZipFile(os.path.join(GEONAMES_DATA, basename + '.zip'))
        try:
            contents = zf.read(member).split(b'\n')
        finally:
            zf.close()
        num_lines = len(contents)

        gzf = gzip.GzipFile(os.path.join(GEONAMES_DATA, basename + '.gz'), 'w')
        try:
            for i, line in enumerate(contents):
                if line:
                    values = [value.strip() for value in line.split(b'\t')]
                    row = dict(zip(in_fields, values))
                    if len(values) < len(in_fields):
                        sys.stderr.write('Invalid row (line %d):\n' % i)
                        sys.stderr.write('%s\n' % str(row))
                    else:
                        if show_lengths:
                            # Measured on the raw values, before any clean-up
                            # done by build_values.
                            for field in len_fields:
                                lengths[field] = max(len(row[field]), lengths[field])
                        gzf.write(b'\t'.join(build_values(row)) + b'\n')
                if i % 10000 == 0:
                    sys.stdout.write(self.clear_line)
                    sys.stdout.write('Compressing %s: %.2f%% (%d/%d)'
                                     % (member, (100. * i) / num_lines, i, num_lines))
                    sys.stdout.flush()
        finally:
            gzf.close()

        sys.stdout.write('\n')
        if show_lengths:
            for field in len_fields:
                sys.stdout.write('%s:\t%d\n' % (field, lengths[field]))

    def allCountries(self, **options):
        """Write ``allCountries.gz`` from ``allCountries.zip``.

        Each output row drops ``asciiname``, ``latitude`` and ``longitude``
        and gains a trailing EWKT point column built from the coordinates.
        Reads ``options['lengths']`` (defaults to false when absent).
        """
        in_fields = ['geonameid', 'name', 'asciiname', 'alternates', 'latitude', 'longitude',
                     'fclass', 'fcode', 'country_code', 'cc2',
                     'admin1', 'admin2', 'admin3', 'admin4',
                     'population', 'elevation', 'topo', 'timezone', 'mod_date']
        out_fields = [f for f in in_fields if f not in ('latitude', 'longitude', 'asciiname')]
        len_fields = ['name', 'asciiname', 'alternates', 'fclass', 'fcode', 'country_code',
                      'cc2', 'admin1', 'admin2', 'admin3', 'admin4', 'timezone']

        def build_values(row):
            # Fixing trailing backslash problem in geonames data.
            if row['name'][-1:] == b'\\':
                row['name'] = row['name'][:-1]
            # Setting integers to 0 so they won't have to be NULL.
            for key in ('population', 'elevation', 'topo'):
                if not row[key]:
                    row[key] = b'0'
            # Getting the EWKT for the point -- has to be EWKT or else
            # the insertion of the point will raise a constraint error
            # for a non-matching ID.
            wkt = b'SRID=4326;POINT(' + row['longitude'] + b' ' + row['latitude'] + b')'
            return [row[k] for k in out_fields] + [wkt]

        self._compress('allCountries', in_fields, len_fields, build_values,
                       options.get('lengths', False))

    def alternateNames(self, **options):
        """Write ``alternateNames.gz`` from ``alternateNames.zip``.

        The ``preferred`` and ``short`` columns are normalised to ``1`` when
        non-empty and ``0`` when empty; the other columns are copied through.
        Reads ``options['lengths']`` (defaults to false when absent).
        """
        in_fields = ['alternateid', 'geoname_id', 'isolanguage', 'variant', 'preferred', 'short']
        bool_fields = ['preferred', 'short']
        len_fields = ['isolanguage', 'variant']
        out_fields = in_fields

        def build_values(row):
            for bool_field in bool_fields:
                row[bool_field] = b'1' if row[bool_field] else b'0'
            return [row[k] for k in out_fields]

        self._compress('alternateNames', in_fields, len_fields, build_values,
                       options.get('lengths', False))

    def handle_noargs(self, **options):
        """Run the selected compressions and optionally print the elapsed time."""
        start_time = datetime.datetime.now()
        if not options['no_countries']:
            self.allCountries(**options)
        if not options['no_alternates']:
            self.alternateNames(**options)
        if options['time']:
            sys.stdout.write('\nCompleted in %s\n' % (datetime.datetime.now() - start_time))