Skip to content

Commit

Permalink
setup of initial release
Browse files Browse the repository at this point in the history
  • Loading branch information
martinbenes1996 committed Jun 18, 2020
1 parent dbb5745 commit 51d0a70
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 0 deletions.
59 changes: 59 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Eurostat

The program `eurostat.py` is a simple interface to parse Eurostat data.

## Executing the module

Parsing data from Eurostat to a file is as easy as

```bash
python3 eurostat.py --output data.csv --start 2019-01-01 --verbose
```

It downloads the file from Eurostat and parses it according to the input to an output format.

```
sex,age,geo\time,2020W23,2020W22,2020W21, ... ,2019W03,2019W02,2019W01
F,OTAL,AT,,,, ... ,852,877,914
F,OTAL,AT1,,, ... ,364,361,387
...
```

All parameters of the command can be shown with

```bash
python3 eurostat.py --help
```

```
usage: eurostat.py [-h] [-o OUTPUT] [-n CHUNKSIZE] [-s START] [-v]
optional arguments:
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
Directs the output to a name of your choice.
-n CHUNKSIZE, --chunksize CHUNKSIZE
Number of lines in chunk (in thousands).
-s START, --start START
Start date.
-v, --verbose Sets verbose log (logging level INFO).
```

## Importing

It can be imported as well. The following code uses the inner function `read_eurostat()` to load the data. The total size of the data frame is about 218 MB, so the call takes more than 15 minutes and the memory usage is enormous.

The module should not be used like this. An implementation using a Big Data framework, e.g. PySpark, is recommended instead.

```python
from datetime import datetime
import eurostat

data = eurostat.read_eurostat(output = None, start = datetime(2019,1,1))
```

The parameter `output = None` causes the output to be collected into a single dataframe and returned.

## Credits

Author: [Martin Benes](https://www.github.com/martinbenes1996).
16 changes: 16 additions & 0 deletions eurostat_deaths/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
"""Webscraper for Eurostat.
Archive URL: https://ec.europa.eu/eurostat
Todo:
* caching
"""

import pkg_resources
from .deaths import *
from .populations import *

# Expose the installed distribution version; fall back to None when the
# package metadata is unavailable (e.g. running from a source checkout).
try:
    __version__ = pkg_resources.get_distribution("eurostat_deaths").version
except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
    __version__ = None
124 changes: 124 additions & 0 deletions eurostat_deaths/deaths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@

from datetime import datetime
import gzip
from io import BytesIO
import logging
import pandas as pd
import requests
import warnings

def tryInt(i):
    """Soft int parser: return ``int(i)`` when possible, otherwise bypass the input.

    Args:
        i (any): Value to parse an int from.

    Returns:
        int or any: The parsed integer, or the original value unchanged when
        it cannot be converted.
    """
    try:
        return int(i)
    except (TypeError, ValueError):  # narrowed from bare except
        return i

def deaths(start = None, output = "output.csv", chunksize = 1):
    """Read weekly deaths data from Eurostat, filter and save to CSV.

    Downloads the ~200 MB ``demo_r_mweek3`` bulk file and processes it
    chunk by chunk to keep memory usage bounded.

    Args:
        start (datetime, optional): Start time. Will be rounded to week.
            End time is always the end of data. Default is all the data
            (no filtering).
        output (str, optional): Output CSV file. If None, the processed
            chunks are concatenated and returned as a single dataframe.
            Default is "output.csv".
        chunksize (int, optional): Size of chunk to process data by
            (in thousands of lines). Default is 1 (1000 lines per chunk).

    Returns:
        pandas.DataFrame or None: The concatenated dataframe when ``output``
        is None, otherwise None (the data is written to the output file).
    """
    # download gzipped tsv
    logging.warning("input has over 200MB, processing will take a few minutes (for me 15 min)")
    logging.info("downloading zip file")
    url = 'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_mweek3.tsv.gz'
    zipinput = requests.get(url, stream=True)

    # unzip tsv and process chunk by chunk
    data = None
    with gzip.GzipFile(fileobj = BytesIO(zipinput.content), mode = "r") as z:
        logging.info("parsing zip file")

        # the header mixes comma and tab separators, hence the regex sep
        for i, chunk in enumerate(pd.read_csv(z, sep=",|\t", engine="python", chunksize=chunksize * 10**3)):
            # normalize column names
            chunk.columns = [c.strip() for c in chunk.columns]
            data_columns = set(chunk.columns) - {'unit', 'sex', 'age', 'geo\\time'}

            # parse values: ":" marks missing data, and flagged values
            # (e.g. "123 p") are reduced to their numeric part
            chunk[list(data_columns)] = chunk[list(data_columns)]\
                .replace({r'\s*:\s*': None, r'[^0-9]*([0-9]+)[^0-9]*': r'\1'}, regex = True)\
                .apply(tryInt)
            chunk = chunk\
                .drop(['unit'], axis = 1)

            # parse age groups: Y_LT5 -> 0-4, Y_GE90 -> 90; strip the "Y"
            # prefix only where present so that TOTAL/UNK survive unmangled
            # (the original i[1:] turned "TOTAL" into "OTAL")
            chunk['age'] = chunk['age']\
                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90'})\
                .replace({'-': '_'})\
                .apply(lambda a: a[1:] if a.startswith('Y') else a)

            # filter weeks: drop every week column strictly before start
            if start is not None and start > datetime(2000, 1, 1):
                year, week = start.year, start.isocalendar()[1]
                cols_to_remove = [f"{y}W{str(w).zfill(2)}"
                                  for y in range(2000, year + 1)
                                  for w in range(1, 54)
                                  if y < year or w < week]
                for col in cols_to_remove:
                    try:
                        chunk = chunk.drop(col, axis = 1)
                    except KeyError:
                        pass  # week column not present in the file

            # output: append to CSV, or accumulate into a single dataframe
            if output is not None:
                if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
                else: chunk.to_csv(output, mode='a', header=False, index=False)
            else:
                # fixed: original called .append on None and the nonexistent
                # DataFrame.concat; accumulate with pd.concat instead
                data = chunk if data is None else pd.concat([data, chunk], ignore_index=True)

            # NOTE(review): 64000 is a hard-coded estimate of the total line count
            logging.info(f"parsed {chunksize * (i + 1) * 10**3}/64000 lines")

    # fixed: the docstring promised a dataframe for output=None, but the
    # original never returned it
    return data


def _parse_args():
    """Parse command line arguments for direct module execution.

    Returns:
        dict: Option values keyed by '--output', '--chunksize', '--start'
        and '--verbose', with defaults applied.
    """
    # parse arguments
    import argparse

    def check_positive(value):
        # chunk size must be a positive integer (thousands of lines)
        ivalue = int(value)
        if ivalue <= 0:
            raise argparse.ArgumentTypeError(f"{value} is an invalid positive int value")
        return ivalue

    def check_date(value):
        # accept a bare year, a date, a datetime, or a week of year;
        # first matching format wins
        attempts = [
            (f"{value}-01-01", "%Y-%m-%d"),    # bare year, e.g. "2019"
            (value, "%Y-%m-%d"),               # date
            (value, "%Y-%m-%d %H:%M:%S"),      # datetime
            (f"{value}-1", "%Y-W%W-%w"),       # week of year, e.g. "2019-W05"
        ]
        for text, fmt in attempts:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:  # narrowed from bare except
                continue
        raise argparse.ArgumentTypeError(f"{value} is an invalid date/week value")

    # create argument records
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Directs the output to a name of your choice.")
    parser.add_argument("-n", "--chunksize", type=check_positive, help="Number of lines in chunk (in thousands).")
    parser.add_argument("-s", "--start", type=check_date, help="Start date.")
    parser.add_argument("-v", "--verbose", action='count', default=0, help="Sets verbose log (logging level INFO).")
    args = parser.parse_args()

    # apply defaults and map to option-name keys
    return {'--output': args.output if args.output else "output.csv",
            '--chunksize': args.chunksize if args.chunksize else 1,
            '--start': args.start if args.start else None,
            '--verbose': bool(args.verbose)}

if __name__ == "__main__":
    # parse command line options into a dict keyed by option name
    args = _parse_args()
    # set verbose: enable INFO-level logging when -v/--verbose was given
    if args['--verbose']:
        logging.basicConfig(level = logging.INFO)

    # call main function: download, parse and write the Eurostat data
    deaths(start = args['--start'], output = args['--output'], chunksize = args['--chunksize'])

# public API of this module
__all__ = ["deaths"]
6 changes: 6 additions & 0 deletions eurostat_deaths/populations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@


def populations(*args, **kwargs):
    """Placeholder for the Eurostat populations scraper.

    Args:
        *args: Ignored.
        **kwargs: Ignored.

    Raises:
        NotImplementedError: Always; this feature is not implemented yet.
    """
    raise NotImplementedError

__all__ = ["populations"]
7 changes: 7 additions & 0 deletions publish.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# remove previous releases
rm -rf build/ dist/ eurostat_deaths.egg-info/ __pycache__/
# compile
python setup.py sdist bdist_wheel
# publish
python -m twine upload dist/*
Empty file added requirements.txt
Empty file.
44 changes: 44 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

# requirements
try:
with open('requirements.txt') as f:
reqs = f.read().splitlines()
except:
reqs = []

import setuptools
with open("README.md", "r", encoding="UTF-8") as fh:
long_description = fh.read()

setuptools.setup(
name = 'eurostat_deaths',
version = '0.0.1',
author = 'Martin Beneš',
author_email = 'martinbenes1996@gmail.com',
description = 'Web Scraper for Eurostat data.',
long_description = long_description,
long_description_content_type="text/markdown",
packages=setuptools.find_packages(),
license='MIT',
url = 'https://github.com/martinbenes1996/eurostat_deaths',
download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.1.tar.gz',
keywords = ['eurostat', 'deaths', 'web', 'html', 'webscraping'],
install_requires = reqs,
package_dir={'': '.'},
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Science/Research',
'Intended Audience :: Developers',
'Intended Audience :: Other Audience',
'Environment :: Web Environment',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Software Development :: Libraries',
'Topic :: Utilities',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
],
)

0 comments on commit 51d0a70

Please sign in to comment.