In [1]:
# hide
# all_slow

In [2]:
# hide
# no_test
# Each install only runs when /content exists (presumably a Google Colab runtime -- TODO confirm);
# elsewhere the `[ -e /content ]` test fails and pip is skipped.
![ -e /content ] && pip install -Uqq pyarrow fastparquet
![ -e /content ] && pip install -Uqq geowrangler

# Datasets Download
> Basic introduction to downloading open geospatial data using geowrangler

In [3]:
# Reload edited modules automatically before each cell executes (autoreload mode 2 = all modules).
%reload_ext autoreload
%autoreload 2

## Downloading Geofabrik Data

In [4]:
import geopandas as gpd

from geowrangler.datasets import geofabrik

### Listing down available regions

In [5]:
# Mapping of region name -> download URL for every region geofabrik offers.
regions = geofabrik.list_geofabrik_regions()
# Keep only the entries whose download URL contains "asia".
{name: url for name, url in regions.items() if "asia" in url}

{'afghanistan': 'https://download.geofabrik.de/asia/afghanistan-latest-free.shp.zip',
 'armenia': 'https://download.geofabrik.de/asia/armenia-latest-free.shp.zip',
 'azerbaijan': 'https://download.geofabrik.de/asia/azerbaijan-latest-free.shp.zip',
 'bangladesh': 'https://download.geofabrik.de/asia/bangladesh-latest-free.shp.zip',
 'bhutan': 'https://download.geofabrik.de/asia/bhutan-latest-free.shp.zip',
 'cambodia': 'https://download.geofabrik.de/asia/cambodia-latest-free.shp.zip',
 'central-zone': 'https://download.geofabrik.de/asia/india/central-zone-latest-free.shp.zip',
 'china': 'https://download.geofabrik.de/asia/china-latest-free.shp.zip',
 'chubu': 'https://download.geofabrik.de/asia/japan/chubu-latest-free.shp.zip',
 'chugoku': 'https://download.geofabrik.de/asia/japan/chugoku-latest-free.shp.zip',
 'east-timor': 'https://download.geofabrik.de/asia/east-timor-latest-free.shp.zip',
 'eastern-zone': 'https://download.geofabrik.de/asia/india/eastern-zone-latest-free.shp.zip',
 '

### Downloading a region file to a directory


In [6]:
# hide
# no_test
!rm -r ../test_dir
!mkdir -p ../test_dir

In [15]:
%%time
# no_test
# Download the region's `*-latest-free.shp.zip` archive into ../test_dir and show the returned path.
# NOTE(review): the saved output of this cell was recorded for "philippines", not "laos";
# re-run the notebook to refresh it (paths and timings will differ).
downloaded_file = geofabrik.download_geofabrik_region("laos", "../test_dir")
downloaded_file

2023-02-01 11:05:38.099 | INFO     | geowrangler.datasets.utils:urlretrieve:40 - Retrieving https://download.geofabrik.de/asia/philippines-latest-free.shp.zip into ../test_dir/philippines-latest-free.shp.zip


CPU times: user 14.2 s, sys: 12.5 s, total: 26.7 s
Wall time: 2min 55s


Path('../test_dir/philippines-latest-free.shp.zip')

### Loading geofabrik files

In [8]:
# Read the downloaded zip archive directly with geopandas and preview the first rows.
gdf = gpd.read_file(downloaded_file)
gdf.head()

You can also list the contents of the zipped shapefile, as well as load an individual shapefile from within it.

In [20]:
# `unzip -l` only lists the archive members; nothing is extracted.
!unzip -l {downloaded_file.as_posix()}

Archive:  ../test_dir/philippines-latest-free.shp.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
      654  2023-02-01 08:31   README
        6  2023-02-01 08:28   gis_osm_buildings_a_free_1.cpg
1739092439  2023-02-01 08:30   gis_osm_buildings_a_free_1.dbf
      144  2023-02-01 08:28   gis_osm_buildings_a_free_1.prj
1484415884  2023-02-01 08:30   gis_osm_buildings_a_free_1.shp
 84319724  2023-02-01 08:30   gis_osm_buildings_a_free_1.shx
        6  2023-02-01 08:28   gis_osm_landuse_a_free_1.cpg
 29884517  2023-02-01 08:30   gis_osm_landuse_a_free_1.dbf
      144  2023-02-01 08:28   gis_osm_landuse_a_free_1.prj
125154324  2023-02-01 08:30   gis_osm_landuse_a_free_1.shp
  1648892  2023-02-01 08:30   gis_osm_landuse_a_free_1.shx
        6  2023-02-01 08:28   gis_osm_natural_a_free_1.cpg
   382672  2023-02-01 08:30   gis_osm_natural_a_free_1.dbf
      144  2023-02-01 08:28   gis_osm_natural_a_free_1.prj
  1694392  2023-02-01 08:30   gis_osm_natural_a_free_1.shp
 

In [22]:
%%time
# The `<zip path>!<member name>` form picks one shapefile out of the archive.
gdf2 = gpd.read_file(f"{downloaded_file.as_posix()}!gis_osm_pois_free_1.shp")

CPU times: user 4.05 s, sys: 0 ns, total: 4.05 s
Wall time: 4.04 s


### Unzipping and Caching OSM 

In addition to downloading, the geofabrik module provides an unzipping and caching facility (default cache directory: `~/.cache/geowrangler/osm`) to make it easier to access OSM data.

In [12]:
%%time
# Download and unzip the region's OSM data (or reuse the cached copy);
# the value displayed is the local directory that holds the files.
download_path = geofabrik.download_osm_region_data("afghanistan")
download_path

2023-02-01 11:01:55.933 | INFO     | geowrangler.datasets.geofabrik:download_osm_region_data:148 - OSM Data: Cached data available for afghanistan at /home/butchtm/.geowrangler/osm/afghanistan? True


CPU times: user 2.98 ms, sys: 1.27 ms, total: 4.25 ms
Wall time: 3.4 ms


'/home/butchtm/.geowrangler/osm/afghanistan'

Downloading it a second time should be much faster, as it
only checks whether the data is already in the cache and returns its path.

In [13]:
%%time
download_path2 = geofabrik.download_osm_region_data("afghanistan")
download_path2

2023-02-01 11:01:55.976 | INFO     | geowrangler.datasets.geofabrik:download_osm_region_data:148 - OSM Data: Cached data available for afghanistan at /home/butchtm/.geowrangler/osm/afghanistan? True


CPU times: user 1.9 ms, sys: 816 µs, total: 2.71 ms
Wall time: 2.26 ms


'/home/butchtm/.geowrangler/osm/afghanistan'

In [14]:
# Long listing of everything in the cached region directory (-d: do not descend into sub-directories).
!ls -ald {download_path2}/*

-rw-r--r-- 1 butchtm butchtm       654 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/README
-rw-r--r-- 1 butchtm butchtm         6 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_buildings_a_free_1.cpg
-rw-r--r-- 1 butchtm butchtm 272553794 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_buildings_a_free_1.dbf
-rw-r--r-- 1 butchtm butchtm       144 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_buildings_a_free_1.prj
-rw-r--r-- 1 butchtm butchtm 248453820 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_buildings_a_free_1.shp
-rw-r--r-- 1 butchtm butchtm  13214820 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_buildings_a_free_1.shx
-rw-r--r-- 1 butchtm butchtm         6 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_landuse_a_free_1.cpg
-rw-r--r-- 1 butchtm butchtm  17009387 Jan 31 13:18 /home/butchtm/.geowrangler/osm/afghanistan/gis_osm_landuse_a_free_1.dbf
-rw-r--r-- 1 butchtm

### Using the OSM Data Manager

We also provide an OSM Data Manager which, in addition to using the default cache (`~/.cache/geowrangler/osm`),
also caches the geodataframe for either the pois or the roads datasets from OSM in memory to avoid having
to reload the OSM data from disk.

In [None]:
# Create the osm data manager
odm = geofabrik.OsmDataManager()

In [None]:
# Load the pois dataset for the Philippines through the data manager.
pois_ph = odm.load_pois("philippines")

In [None]:
# Preview the first rows of the pois data.
pois_ph.head()

In [None]:
# Load the roads dataset for the Philippines through the data manager.
roads_ph = odm.load_roads("philippines")

In [None]:
# Preview the first rows of the roads data.
roads_ph.head()

## Loading OSM data from other years

In addition to providing access to the latest OSM shape files, we also provide an optional `year` parameter, which allows you to download OSM data from previous years. 

> Note: The availability of data from previous years is dependent on what geofabrik has made available. Please check the [Geofabrik download site](https://download.geofabrik.de/) for the list of available data 

In [None]:
pois_ph = odm.load_pois("philippines", year="2017")

In [None]:
pois_ph

## Downloading Ookla Data


In [None]:
import geopandas as gpd
import pandas as pd

from geowrangler.datasets import ookla

### Listing down available files

In [None]:
# Fetch the listing of available Ookla files and display it.
ookla_files = ookla.list_ookla_files()
ookla_files

### Downloading an ookla file to a directory
> Warning: Ookla files are >300MB and can reach ~550MB. Download with caution

In [None]:
# Make sure the target directory exists, then download the "fixed" 2019 Q1 file into it.
# NOTE(review): this rebinds `downloaded_file`, which earlier held the geofabrik zip path.
!mkdir -p ../test_dir
downloaded_file = ookla.download_ookla_file(
    type_="fixed", year="2019", quarter="1", directory="../test_dir"
)
downloaded_file

### Loading ookla data

In [None]:
# The downloaded file is read as parquet with pandas; preview the first rows.
df = pd.read_parquet(downloaded_file)
df.head()