eurostat populations
martinbenes1996 committed Jun 19, 2020
1 parent b6607dd commit 05ac40b
Showing 4 changed files with 77 additions and 12 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -56,7 +56,17 @@ One additional setting is `chunksize` to set the size of the chunk that is processed

## Population

**TODO**
Yearly populations for NUTS-2 and NUTS-3 regions can be fetched as follows:

```python
import eurostat_deaths as eurostat

data = eurostat.populations()
```

As with the `deaths()` call, `populations()` can be parametrized with `chunksize` (in thousands of lines) and `output`, which forwards the output to a file instead of returning it and thus avoids allocating a big data frame in main memory.

Here the data volume is much lower, so the regular usage of returning the data frame is practical.
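
For example, a minimal sketch of the parametrized call (the file name below is illustrative):

```python
import eurostat_deaths as eurostat

# stream chunks of 20,000 lines straight to a CSV file instead of keeping them in memory
eurostat.populations(output="populations.csv", chunksize=20)
```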

## Credits

11 changes: 5 additions & 6 deletions eurostat_deaths/deaths.py
@@ -51,10 +51,8 @@ def deaths(start = None, output = None, chunksize = 1):

            # parse age groups
            chunk['age'] = chunk['age']\
                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90'})\
                .replace({'-':'_'})\
                .apply(lambda i: i[1:])

                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90', 'Y_GE85': 'Y85'})\
                .replace({r'(.*)-(.*)':r'\1_\2', r'Y(.*)':r'\1'}, regex = True)
            # filter weeks
            if start is not None and start > datetime(2000,1,1):
                year, week = start.year, start.isocalendar()[1]
@@ -69,11 +67,12 @@
                if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
                else: chunk.to_csv(output, mode='a', header=False, index=False)
            else:
                if data is None: data = data.append(chunk)
                if data is None: data = chunk
                else: data = pd.concat([data, chunk])

            logging.info(f"parsed {chunksize * (i + 1) * 10**3}/64000 lines")


    return data

def _parse_args():
"""Parses arguments for direct module execution."""
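For reference, a minimal sketch of what the updated age-label replacements produce on a toy pandas Series (the input values are illustrative):

```python
import pandas as pd

# toy Eurostat-style age labels
age = pd.Series(['Y_LT5', 'Y10-14', 'Y_GE90', 'TOTAL'])

normalized = age\
    .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90', 'Y_GE85': 'Y85'})\
    .replace({r'(.*)-(.*)': r'\1_\2', r'Y(.*)': r'\1'}, regex=True)

print(list(normalized))  # -> ['0_4', '10_14', '90', 'TOTAL']
```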
62 changes: 59 additions & 3 deletions eurostat_deaths/populations.py
@@ -1,6 +1,62 @@

import gzip
from io import BytesIO
import logging

def populations(*args, **kwargs):
    raise NotImplementedError
import pandas as pd
import requests

__all__ = ["populations"]
# soft int parser
def tryInt(i):
"""Soft int parser. If not possible, bypasses input.
Args:
i (any): Value to parse int from.
"""
try: return int(i)
except: return i

def populations(output = None, chunksize = 10):
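    """Fetches Eurostat populations for NUTS-2 and NUTS-3 regions.

    Args:
        output (str, optional): Path to a CSV file; if given, chunks are written there instead of being returned.
        chunksize (int, optional): Chunk size in thousands of lines.
    """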
    # download zip
    url = 'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_pjangrp3.tsv.gz'
    zipinput = requests.get(url, stream = True)

    # unzip tsv
    data = None
    with gzip.GzipFile(fileobj = BytesIO(zipinput.content), mode = "r") as z:
        logging.info("parsing zip file")

        for i,chunk in enumerate(pd.read_csv(z, sep=",|\t", engine = "python", chunksize = chunksize * 10**3)):
            # columns
            chunk.columns = [c.strip() for c in chunk.columns]
            data_columns = set(chunk.columns) - {'unit','sex','age','geo\\time'}

            # parse data
            chunk[ list(data_columns) ] = chunk[ list(data_columns) ]\
                .replace({r'\s*:\s*': None, r'[^0-9]*([0-9]+)[^0-9]*': r'\1'}, regex = True)\
                .apply(tryInt)
            chunk = chunk\
                .drop(['unit'], axis = 1)

            # parse age groups
            chunk['age'] = chunk['age']\
                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90', 'Y_GE85': 'Y85'})\
                .replace({r'(.*)-(.*)':r'\1_\2', r'Y(.*)':r'\1'}, regex = True)
            # output
            if output is not None:
                if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
                else: chunk.to_csv(output, mode='a', header=False, index=False)
            else:
                if data is None: data = chunk
                else: data = data.append(chunk)

            logging.info(f"parsed {chunksize*i*10**3 + min(chunksize*10**3, chunk.shape[0])}/131880 lines")

    return data

__all__ = ["populations"]

if __name__ == "__main__":
    logging.basicConfig(level = logging.INFO)
    data = populations()
    print(data.age.unique())
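
For context, a short usage sketch of the new module (the region code and the filtering step below are illustrative assumptions, not part of the commit):

```python
import eurostat_deaths as eurostat

data = eurostat.populations()

# 'geo\time' is Eurostat's raw header for the region column (kept as-is by the parser)
prague = data[data['geo\\time'] == 'CZ010']  # CZ010: Prague, a NUTS-3 code
print(prague.head())
```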
4 changes: 2 additions & 2 deletions setup.py
@@ -12,7 +12,7 @@

setuptools.setup(
    name = 'eurostat_deaths',
    version = '0.0.3',
    version = '0.0.4',
    author = 'Martin Beneš',
    author_email = 'martinbenes1996@gmail.com',
    description = 'Web Scraper for Eurostat data.',
@@ -21,7 +21,7 @@
    packages=setuptools.find_packages(),
    license='MIT',
    url = 'https://github.com/martinbenes1996/eurostat_deaths',
    download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.3.tar.gz',
    download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.4.tar.gz',
    keywords = ['eurostat', 'deaths', 'web', 'html', 'webscraping'],
    install_requires = reqs,
    package_dir={'': '.'},
