In [1]:
import gzip
import os
import pickle
import sys
from collections import defaultdict

import gdal
import geopandas
import numpy as np
import ogr
import pandas as pd
from shapely import wkb
from tqdm import tqdm

DATA_PATH = '../data'

In [5]:
%%time

def build_osm_stats(osm_file_name, limit=None, max_distinct=1000):
    """Scan every layer of an OGR-readable file and tally attribute values.

    Parameters
    ----------
    osm_file_name : str
        Path handed to ``ogr.Open``.
    limit : int or None
        If given, stop reading a layer once this many features have been
        seen. The cap is applied to each layer separately, not to the file
        as a whole.
    max_distinct : int
        Cardinality cap per attribute. Once an attribute holds more than
        this many distinct values, values that are not tracked yet are
        counted under the ``'___others__'`` key instead of getting their own
        entry. Values that are already tracked keep being counted
        individually.

    Returns
    -------
    (layers, fields)
        ``layers`` maps the layer index to
        ``{'index': i, 'name': <layer description>, 'size': <features read>}``.
        ``fields`` maps each attribute name to a plain ``{value: count}``
        dict, accumulated across all layers.

    Raises
    ------
    RuntimeError
        If ``ogr.Open`` returns ``None`` for ``osm_file_name``.
    """
    # NOTE(review): layers are still read one after another below; confirm
    # against the GDAL OSM driver docs that this is the intended iteration
    # pattern when interleaved reading is switched on.
    gdal.SetConfigOption('OGR_INTERLEAVED_READING', 'YES')
    osm = ogr.Open(osm_file_name)
    if osm is None:
        # Without this guard a bad path surfaces as an opaque AttributeError
        # on the first method call below.
        raise RuntimeError("Could not open OSM file: %s" % osm_file_name)

    fields = defaultdict(lambda: defaultdict(int))
    layers = {}
    layer_count = osm.GetLayerCount()
    print("Total layers: ", layer_count)
    for i in range(layer_count):
        layer = osm.GetLayer(i)
        layer.ResetReading()
        layer_name = layer.GetDescription()
        print("Layer %i: %s" % (i + 1, layer_name))
        cnt = 0
        for feat in layer:
            cnt += 1
            if cnt % 100000 == 0:
                print(cnt)
            for k, v in feat.items().items():
                counts = fields[k]
                # Keep counting values we already track; only values that
                # would grow the table past the cap go to the overflow bucket.
                if v in counts or len(counts) <= max_distinct:
                    counts[v] += 1
                else:
                    counts['___others__'] += 1
            if limit is not None and cnt >= limit:
                break
        layers[i] = {'index': i, 'name': layer_name, 'size': cnt}

    # Convert the nested defaultdicts to plain dicts: the lambda default
    # factory cannot be pickled.
    return layers, {k: dict(v) for k, v in fields.items()}


# Building the statistics requires a full pass over the extract, so the result
# is cached as a pickle next to the data and reused when it can be read back.
osm_stats_path = os.path.join(DATA_PATH, 'osm_stats.pickle')
try:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook produced itself.
    with open(osm_stats_path, 'rb') as f:
        layers, fields = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: rebuild it from the source file. Only
    # cache-related errors are caught so that genuine bugs (and Ctrl-C) are
    # not silently turned into a full rebuild.
    layers, fields = build_osm_stats(os.path.join(DATA_PATH, 'russia-latest.osm.pbf'))
    with open(osm_stats_path, 'wb') as f:
        pickle.dump((layers, fields), f)

Total layers:  5
Layer 1: points
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
Layer 2: lines
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
76

In [19]:
# For every attribute (alphabetical order) print its name and number of
# distinct values, followed by a preview of at most 20 of those values.
for field_name in sorted(fields):
    value_counts = fields[field_name]
    n_values = len(value_counts)
    print("%10s  %i" % (field_name, n_values))
    preview = list(value_counts)[:20]
    for value in preview:
        print("\t\t%s" % value)
    # Signal that the preview above was truncated.
    if n_values > 20:
        print("\t\t...")

   address  8
		None
		Saatse küla, Värska vald, 64037 Põlva maakond
		Pikk 40, Värska, 64001 Põlvamaa
		Peterburi mnt 2, 20308 Narva, Ida-Virumaa
		Мичуринский пр-т, д. 20, к. 1 (Олимпийская деревня)
		ул.Комсомольская 10
		Город Омск, пр. Карла Маркса, 41/1
		Молочница
admin_level  9
		None
		4
		6
		5
		8
		9
		7
		3
		10
 aerialway  16
		None
		chair_lift
		platter
		drag_lift
		t-bar
		cable_car
		construction
		gondola
		j-bar
		rope_tow
		goods
		zip_line
		magic_carpet
		mixed_lift
		yes
		no
   aeroway  2
		None
		aerodrome
   amenity  41
		None
		school
		hospital
		college
		library
		public_building
		university
		clinic
		townhall
		theatre
		police
		stadiu
		community_centre
		arts_centre
		embassy
		cafe
		dentist
		parking
		kindergarten
		prison
		...
   barrier  147
		None
		toll_booth
		lift_gate
		border_control
		gate
		block
		yes
		entrance
		bollard
		cycle_barrier
		cattle_grid
		height_restrictor
		chain
		border_crossing
		swing_gate
		sally_port
		hampshire