-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
dbb5745
commit 51d0a70
Showing
7 changed files
with
256 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# Eurostat | ||
|
||
The program `eurostat.py` is a simple interface to parse Eurostat data. | ||
|
||
## Executing the module
|
||
Parsing data from Eurostat to a file is as easy as | ||
|
||
```bash | ||
python3 eurostat.py --output data.csv --start 2019-01-01 --verbose | ||
``` | ||
|
||
It downloads the file from Eurostat and parses it according to the input to an output format. | ||
|
||
``` | ||
sex,age,geo\time,2020W23,2020W22,2020W21, ... ,2019W03,2019W02,2019W01 | ||
F,OTAL,AT,,,, ... ,852,877,914 | ||
F,OTAL,AT1,,, ... ,364,361,387 | ||
... | ||
``` | ||
|
||
All parameters of the command can be shown with | ||
|
||
```bash | ||
python3 eurostat.py --help | ||
``` | ||
|
||
``` | ||
usage: eurostat.py [-h] [-o OUTPUT] [-n CHUNKSIZE] [-s START] [-v] | ||
optional arguments: | ||
-h, --help show this help message and exit | ||
-o OUTPUT, --output OUTPUT | ||
Directs the output to a name of your choice. | ||
-n CHUNKSIZE, --chunksize CHUNKSIZE | ||
Number of lines in chunk (in thousands). | ||
-s START, --start START | ||
Start date. | ||
-v, --verbose Sets verbose log (logging level INFO). | ||
``` | ||
|
||
## Importing | ||
|
||
It can be imported as well. The following code uses the function `read_eurostat()` to load the data. The total size of the data frame is about 218 MB, so the call takes more than 15 minutes and the memory usage is enormous.
|
||
The module should not be used like this. Recommended is implementation using Big Data framework, e.g. PySpark. | ||
|
||
```python | ||
from datetime import datetime | ||
import eurostat | ||
|
||
data = eurostat.read_eurostat(output = None, start = datetime(2019,1,1)) | ||
``` | ||
|
||
Parameter `output = None` causes the output to be collected into a single dataframe and returned.
|
||
## Credits | ||
|
||
Author: [Martin Benes](https://www.github.com/martinbenes1996). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# -*- coding: utf-8 -*- | ||
"""Webscraper for Eurostat. | ||
Archive URL: https://ec.europa.eu/eurostat | ||
Todo: | ||
* caching | ||
""" | ||
|
||
import pkg_resources | ||
from .deaths import * | ||
from .populations import * | ||
|
||
try:
    # version of the installed distribution of this package
    __version__ = pkg_resources.get_distribution("eurostat_deaths").version
except pkg_resources.DistributionNotFound:
    # running from source without an installed distribution
    __version__ = None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
|
||
from datetime import datetime | ||
import gzip | ||
from io import BytesIO | ||
import logging | ||
import pandas as pd | ||
import requests | ||
import warnings | ||
|
||
# soft int parser | ||
# soft int parser
def tryInt(i):
    """Soft int parser. If parsing is not possible, bypasses (returns) the input.

    Args:
        i (any): Value to parse an int from.

    Returns:
        int or any: ``int(i)`` when convertible, otherwise ``i`` unchanged.
    """
    # int() raises ValueError for malformed strings and TypeError for
    # unsupported types (e.g. None); anything else should propagate
    try:
        return int(i)
    except (ValueError, TypeError):
        return i
|
||
def deaths(start = None, output = "output.csv", chunksize = 1):
    """Reads weekly deaths data from Eurostat, filters and saves to CSV.

    Downloads the ~200 MB ``demo_r_mweek3`` TSV dump from Eurostat and
    processes it chunk by chunk to keep memory bounded.

    Args:
        start (datetime, optional): Start time. Will be rounded to its ISO week.
            End time is always the end of the data. Default is all the data (no filtering).
        output (str, optional): Output file. If None, the processed data is
            collected and returned as a single dataframe. Default is "output.csv".
        chunksize (int, optional): Size of chunk to process data by (in thousands).
            Default is 1 (1000 lines in chunk).

    Returns:
        pandas.DataFrame or None: concatenated data when ``output`` is None,
        otherwise None (data is written to ``output``).
    """
    # download zip
    logging.warning("input has over 200MB, processing will take a few minutes (for me 15 min)")
    logging.info("downloading zip file")
    url = 'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_mweek3.tsv.gz'
    zipinput = requests.get(url, stream=True)

    # unzip tsv
    collected = []  # chunks accumulated when output is None
    with gzip.GzipFile(fileobj = BytesIO(zipinput.content), mode = "r") as z:
        logging.info("parsing zip file")

        # the file mixes "," and tab separators, hence the regex sep
        for i, chunk in enumerate(pd.read_csv(z, sep=r",|\t", engine = "python", chunksize = chunksize * 10**3)):
            # normalize column names
            chunk.columns = [c.strip() for c in chunk.columns]
            data_columns = list(set(chunk.columns) - {'unit', 'sex', 'age', 'geo\\time'})

            # parse data: ":" marks a missing value; strip non-digit garnish around counts
            chunk[data_columns] = chunk[data_columns]\
                .replace({r'\s*:\s*': None, r'[^0-9]*([0-9]+)[^0-9]*': r'\1'}, regex = True)\
                .apply(tryInt)
            chunk = chunk.drop(['unit'], axis = 1)

            # parse age groups: unify special codes, then strip the leading "Y"
            chunk['age'] = chunk['age']\
                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90'})\
                .replace({'-': '_'})\
                .apply(lambda i: i[1:])

            # filter weeks strictly before the start week
            if start is not None and start > datetime(2000, 1, 1):
                year, week = start.year, start.isocalendar()[1]
                cols_to_remove = [f"{y}W{str(w).zfill(2)}"
                                  for y in range(2000, year + 1)
                                  for w in range(1, 54)
                                  if y < year or w < week]
                # errors='ignore' skips week columns absent from this chunk
                chunk = chunk.drop(columns = cols_to_remove, errors = 'ignore')

            # output: first chunk writes the header, later chunks append
            if output is not None:
                if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
                else: chunk.to_csv(output, mode='a', header=False, index=False)
            else:
                collected.append(chunk)

            logging.info(f"parsed {chunksize * (i + 1) * 10**3}/64000 lines")

    # return the collected dataframe when not writing to a file
    if output is None and collected:
        return pd.concat(collected, ignore_index = True)
|
||
|
||
def _parse_args():
    """Parses arguments for direct module execution.

    Returns:
        dict: keys '--output', '--chunksize', '--start', '--verbose'
        mapped to parsed values or their defaults.
    """
    # parse arguments
    import argparse

    def check_positive(value):
        """Argparse type: strictly positive int."""
        ivalue = int(value)
        if ivalue <= 0:
            raise argparse.ArgumentTypeError(f"{value} is an invalid positive int value")
        return ivalue

    def check_date(value):
        """Argparse type: accepts year, date, datetime or ISO-week formats."""
        # try each accepted format in order: "2019", "2019-01-01",
        # "2019-01-01 00:00:00", "2019-W05"
        attempts = (
            (f"{value}-01-01", "%Y-%m-%d"),
            (value, "%Y-%m-%d"),
            (value, "%Y-%m-%d %H:%M:%S"),
            (f"{value}-1", "%Y-W%W-%w"),
        )
        for candidate, fmt in attempts:
            try:
                return datetime.strptime(candidate, fmt)
            except ValueError:
                pass
        raise argparse.ArgumentTypeError(f"{value} is an invalid date/week value")

    # create argument records
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Directs the output to a name of your choice.")
    parser.add_argument("-n", "--chunksize", type=check_positive, help="Number of lines in chunk (in thousands).")
    parser.add_argument("-s", "--start", type=check_date, help="Start date.")
    parser.add_argument("-v", "--verbose", action='count', default=0, help="Sets verbose log (logging level INFO).")
    args = parser.parse_args()
    # apply defaults for missing arguments
    return {'--output': args.output if args.output else "output.csv",
            '--chunksize': args.chunksize if args.chunksize else 1,
            '--start': args.start if args.start else None,
            '--verbose': bool(args.verbose)}
|
||
# CLI entry point: parse flags, configure logging, run the scraper.
if __name__ == "__main__":
    # parse arguments
    args = _parse_args()
    # set verbose
    if args['--verbose']:
        logging.basicConfig(level = logging.INFO)

    # call main function
    deaths(start = args['--start'], output = args['--output'], chunksize = args['--chunksize'])

# public API of this module
__all__ = ["deaths"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
|
||
|
||
def populations(*args, **kwargs):
    """Placeholder for the Eurostat populations scraper.

    Raises:
        NotImplementedError: always; this feature is not implemented yet.
    """
    raise NotImplementedError("populations() is not implemented yet")

__all__ = ["populations"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
|
||
#!/bin/sh
# Build and publish the package to PyPI.
# Abort immediately if any step fails, so a broken build is never uploaded.
set -e

# remove previous releases
rm -rf build/ dist/ eurostat_deaths.egg-info/ __pycache__/
# compile
python setup.py sdist bdist_wheel
# publish
python -m twine upload dist/*
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
|
||
import setuptools

# requirements: read pinned dependencies; fall back to none if the file is absent
try:
    with open('requirements.txt', encoding="UTF-8") as f:
        reqs = f.read().splitlines()
except OSError:
    reqs = []

# long description shown on the PyPI project page
with open("README.md", "r", encoding="UTF-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name = 'eurostat_deaths',
    version = '0.0.1',
    author = 'Martin Beneš',
    author_email = 'martinbenes1996@gmail.com',
    description = 'Web Scraper for Eurostat data.',
    long_description = long_description,
    long_description_content_type="text/markdown",
    packages=setuptools.find_packages(),
    license='MIT',
    url = 'https://github.com/martinbenes1996/eurostat_deaths',
    download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.1.tar.gz',
    keywords = ['eurostat', 'deaths', 'web', 'html', 'webscraping'],
    install_requires = reqs,
    package_dir={'': '.'},
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Intended Audience :: Other Audience',
        'Environment :: Web Environment',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Software Development :: Libraries',
        'Topic :: Utilities',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)