Skip to content

Commit

Permalink
setup of initial release
Browse files Browse the repository at this point in the history
  • Loading branch information
martinbenes1996 committed Jun 18, 2020
1 parent dbb5745 commit 51d0a70
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 0 deletions.
59 changes: 59 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Eurostat

The program `eurostat.py` is a simple interface to parse Eurostat data.

## Executing the module

Parsing data from Eurostat to a file is as easy as

```bash
python3 eurostat.py --output data.csv --start 2019-01-01 --verbose
```

It downloads the file from Eurostat and parses it according to the input to an output format.

```
sex,age,geo\time,2020W23,2020W22,2020W21, ... ,2019W03,2019W02,2019W01
F,OTAL,AT,,,, ... ,852,877,914
F,OTAL,AT1,,, ... ,364,361,387
...
```

All parameters of the command can be shown with

```bash
python3 eurostat.py --help
```

```
usage: eurostat.py [-h] [-o OUTPUT] [-n CHUNKSIZE] [-s START] [-v]
optional arguments:
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
Directs the output to a name of your choice.
-n CHUNKSIZE, --chunksize CHUNKSIZE
Number of lines in chunk (in thousands).
-s START, --start START
Start date.
-v, --verbose Sets verbose log (logging level INFO).
```

## Importing

It can be imported as well. The following code uses the inner function `read_eurostat()` to load the data. The total size of the data frame is about 218 MB, so the call takes more than 15 minutes and the memory usage is enormous.

The module should not be used like this. An implementation using a Big Data framework, e.g. PySpark, is recommended instead.

```python
from datetime import datetime
import eurostat

data = eurostat.read_eurostat(output = None, start = datetime(2019,1,1))
```

The parameter `output = None` causes the output to be collected into a single dataframe and returned.

## Credits

Author: [Martin Benes](https://www.github.com/martinbenes1996).
16 changes: 16 additions & 0 deletions eurostat_deaths/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
"""Webscraper for Eurostat.
Archive URL: https://ec.europa.eu/eurostat
Todo:
* caching
"""

import pkg_resources
from .deaths import *
from .populations import *

# Expose the installed distribution version; fall back to None when the
# package metadata is unavailable (e.g. running from a source checkout).
try:
    __version__ = pkg_resources.get_distribution("eurostat_deaths").version
except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
    __version__ = None
124 changes: 124 additions & 0 deletions eurostat_deaths/deaths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@

from datetime import datetime
import gzip
from io import BytesIO
import logging
import pandas as pd
import requests
import warnings

def tryInt(i):
    """Soft int parser: return ``int(i)`` when possible, otherwise bypass the input.

    Args:
        i (any): Value to parse an int from.

    Returns:
        int or any: The parsed integer, or the original value unchanged when
        it cannot be converted.
    """
    try:
        return int(i)
    except (TypeError, ValueError):  # narrowed from bare except
        return i

def deaths(start = None, output = "output.csv", chunksize = 1):
    """Read weekly deaths data from Eurostat, filter and save to CSV.

    Downloads the ~200 MB ``demo_r_mweek3`` bulk file and processes it
    chunk by chunk to keep memory usage bounded.

    Args:
        start (datetime, optional): Start time. Will be rounded to week.
            End time is always the end of data. Default is all the data
            (no filtering).
        output (str, optional): Output CSV file. If None, the processed
            chunks are concatenated and returned as a single dataframe.
            Default is "output.csv".
        chunksize (int, optional): Size of chunk to process data by
            (in thousands of lines). Default is 1 (1000 lines per chunk).

    Returns:
        pandas.DataFrame or None: The concatenated dataframe when ``output``
        is None, otherwise None (the data is written to the output file).
    """
    # download gzipped tsv
    logging.warning("input has over 200MB, processing will take a few minutes (for me 15 min)")
    logging.info("downloading zip file")
    url = 'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_mweek3.tsv.gz'
    zipinput = requests.get(url, stream=True)

    # unzip tsv and process chunk by chunk
    data = None
    with gzip.GzipFile(fileobj = BytesIO(zipinput.content), mode = "r") as z:
        logging.info("parsing zip file")

        # the header mixes comma and tab separators, hence the regex sep
        for i, chunk in enumerate(pd.read_csv(z, sep=",|\t", engine="python", chunksize=chunksize * 10**3)):
            # normalize column names
            chunk.columns = [c.strip() for c in chunk.columns]
            data_columns = set(chunk.columns) - {'unit', 'sex', 'age', 'geo\\time'}

            # parse values: ":" marks missing data, and flagged values
            # (e.g. "123 p") are reduced to their numeric part
            chunk[list(data_columns)] = chunk[list(data_columns)]\
                .replace({r'\s*:\s*': None, r'[^0-9]*([0-9]+)[^0-9]*': r'\1'}, regex = True)\
                .apply(tryInt)
            chunk = chunk\
                .drop(['unit'], axis = 1)

            # parse age groups: Y_LT5 -> 0-4, Y_GE90 -> 90; strip the "Y"
            # prefix only where present so that TOTAL/UNK survive unmangled
            # (the original i[1:] turned "TOTAL" into "OTAL")
            chunk['age'] = chunk['age']\
                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90'})\
                .replace({'-': '_'})\
                .apply(lambda a: a[1:] if a.startswith('Y') else a)

            # filter weeks: drop every week column strictly before start
            if start is not None and start > datetime(2000, 1, 1):
                year, week = start.year, start.isocalendar()[1]
                cols_to_remove = [f"{y}W{str(w).zfill(2)}"
                                  for y in range(2000, year + 1)
                                  for w in range(1, 54)
                                  if y < year or w < week]
                for col in cols_to_remove:
                    try:
                        chunk = chunk.drop(col, axis = 1)
                    except KeyError:
                        pass  # week column not present in the file

            # output: append to CSV, or accumulate into a single dataframe
            if output is not None:
                if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
                else: chunk.to_csv(output, mode='a', header=False, index=False)
            else:
                # fixed: original called .append on None and the nonexistent
                # DataFrame.concat; accumulate with pd.concat instead
                data = chunk if data is None else pd.concat([data, chunk], ignore_index=True)

            # NOTE(review): 64000 is a hard-coded estimate of the total line count
            logging.info(f"parsed {chunksize * (i + 1) * 10**3}/64000 lines")

    # fixed: the docstring promised a dataframe for output=None, but the
    # original never returned it
    return data


def _parse_args():
    """Parse command line arguments for direct module execution.

    Returns:
        dict: Option values keyed by '--output', '--chunksize', '--start'
        and '--verbose', with defaults applied.
    """
    # parse arguments
    import argparse

    def check_positive(value):
        # chunk size must be a positive integer (thousands of lines)
        ivalue = int(value)
        if ivalue <= 0:
            raise argparse.ArgumentTypeError(f"{value} is an invalid positive int value")
        return ivalue

    def check_date(value):
        # accept a bare year, a date, a datetime, or a week of year;
        # first matching format wins
        attempts = [
            (f"{value}-01-01", "%Y-%m-%d"),    # bare year, e.g. "2019"
            (value, "%Y-%m-%d"),               # date
            (value, "%Y-%m-%d %H:%M:%S"),      # datetime
            (f"{value}-1", "%Y-W%W-%w"),       # week of year, e.g. "2019-W05"
        ]
        for text, fmt in attempts:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:  # narrowed from bare except
                continue
        raise argparse.ArgumentTypeError(f"{value} is an invalid date/week value")

    # create argument records
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Directs the output to a name of your choice.")
    parser.add_argument("-n", "--chunksize", type=check_positive, help="Number of lines in chunk (in thousands).")
    parser.add_argument("-s", "--start", type=check_date, help="Start date.")
    parser.add_argument("-v", "--verbose", action='count', default=0, help="Sets verbose log (logging level INFO).")
    args = parser.parse_args()

    # apply defaults and map to option-name keys
    return {'--output': args.output if args.output else "output.csv",
            '--chunksize': args.chunksize if args.chunksize else 1,
            '--start': args.start if args.start else None,
            '--verbose': bool(args.verbose)}

if __name__ == "__main__":
    # parse command line options into a dict keyed by option name
    args = _parse_args()
    # set verbose: enable INFO-level logging when -v/--verbose was given
    if args['--verbose']:
        logging.basicConfig(level = logging.INFO)

    # call main function: download, parse and write the Eurostat data
    deaths(start = args['--start'], output = args['--output'], chunksize = args['--chunksize'])

# public API of this module
__all__ = ["deaths"]
6 changes: 6 additions & 0 deletions eurostat_deaths/populations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@


def populations(*args, **kwargs):
    """Placeholder for the Eurostat populations scraper.

    Args:
        *args: Ignored.
        **kwargs: Ignored.

    Raises:
        NotImplementedError: Always; this feature is not implemented yet.
    """
    raise NotImplementedError

__all__ = ["populations"]
7 changes: 7 additions & 0 deletions publish.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# remove previous releases
rm -rf build/ dist/ eurostat_deaths.egg-info/ __pycache__/
# compile
python setup.py sdist bdist_wheel
# publish
python -m twine upload dist/*
Empty file added requirements.txt
Empty file.
44 changes: 44 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

# requirements
try:
with open('requirements.txt') as f:
reqs = f.read().splitlines()
except:
reqs = []

import setuptools
with open("README.md", "r", encoding="UTF-8") as fh:
long_description = fh.read()

setuptools.setup(
name = 'eurostat_deaths',
version = '0.0.1',
author = 'Martin Beneš',
author_email = 'martinbenes1996@gmail.com',
description = 'Web Scraper for Eurostat data.',
long_description = long_description,
long_description_content_type="text/markdown",
packages=setuptools.find_packages(),
license='MIT',
url = 'https://github.com/martinbenes1996/eurostat_deaths',
download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.1.tar.gz',
keywords = ['eurostat', 'deaths', 'web', 'html', 'webscraping'],
install_requires = reqs,
package_dir={'': '.'},
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Science/Research',
'Intended Audience :: Developers',
'Intended Audience :: Other Audience',
'Environment :: Web Environment',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Software Development :: Libraries',
'Topic :: Utilities',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
],
)

0 comments on commit 51d0a70

Please sign in to comment.