eurostat populations

martinbenes1996 · Jun 19, 2020 · 05ac40b · 05ac40b
1 parent b6607dd
commit 05ac40b
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -56,7 +56,17 @@ One additional setting is `chunksize` to set the size of chunk, that is processe
 
 ## Population
 
-**TODO**
+Yearly populations for NUTS-2 and NUTS-3 regions can be fetched as follows:
+
+```python
+import eurostat_deaths as eurostat
+
+data = eurostat.populations()
+```
+
+As with the `deaths()` call, `populations()` can be parametrized with `chunksize` (in thousands of lines) and `output`, which writes the result to a file instead of returning it and hence saves the time needed to allocate a big data frame in main memory.
+
+Here the data volume is far lower, so the regular usage of returning the data frame is practical.
 
 ## Credits
 

diff --git a/eurostat_deaths/deaths.py b/eurostat_deaths/deaths.py
@@ -51,10 +51,8 @@ def deaths(start = None, output = None, chunksize = 1):
 
             # parse age groups
             chunk['age'] = chunk['age']\
-                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90'})\
-                .replace({'-':'_'})\
-                .apply(lambda i: i[1:])
-
+                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90', 'Y_GE85': 'Y85'})\
+                .replace({r'(.*)-(.*)':r'\1_\2', r'Y(.*)':r'\1'}, regex = True)
             # filter weeks
             if start is not None and start > datetime(2000,1,1):
                 year, week = start.year, start.isocalendar()[1]
@@ -69,11 +67,12 @@ def deaths(start = None, output = None, chunksize = 1):
                 if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
                 else: chunk.to_csv(output, mode='a', header=False, index=False)
             else:
-                if data is None: data = data.append(chunk)
+                if data is None: data = chunk
                 else: data = data.concat(chunk)
 
             logging.info(f"parsed {chunksize * (i + 1) * 10**3}/64000 lines")
-
+
+    return data
 
 def _parse_args():
     """Parses arguments for direct module execution."""

diff --git a/eurostat_deaths/populations.py b/eurostat_deaths/populations.py
@@ -1,6 +1,62 @@
 
+import gzip
+from io import BytesIO
+import logging
 
-def populations(*args, **kwargs):
-    raise NotImplementedError
+import pandas as pd
+import requests
 
-__all__ = ["populations"]
+# soft int parser
+def tryInt(i):
+    """Soft int parser. If not possible, bypasses input.
+
+    Args:
+        i (any): Value to parse int from.
+
+    Returns:
+        int or any: `int(i)` if the conversion succeeds, otherwise `i` unchanged.
+    """
+    # catch only conversion failures; a bare `except:` would also swallow
+    # KeyboardInterrupt and SystemExit
+    try:
+        return int(i)
+    except (TypeError, ValueError, OverflowError):
+        return i
+
+def populations(output = None, chunksize = 10):
+    """Fetches Eurostat populations (bulk file `demo_r_pjangrp3.tsv.gz`).
+
+    Downloads the gzipped TSV, parses it chunk by chunk and either returns
+    the parsed data or writes them to a CSV file.
+
+    Args:
+        output (str, optional): Path of the CSV file to write the parsed data to.
+            If None (default), the data are returned as a data frame.
+        chunksize (int, optional): Size of a processed chunk in thousands of lines.
+
+    Returns:
+        pandas.DataFrame or None: Parsed data if `output` is None (None if
+            nothing was parsed), otherwise None.
+
+    Raises:
+        requests.HTTPError: If the download ends with an HTTP error status.
+    """
+    # download zip
+    url = 'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_pjangrp3.tsv.gz'
+    zipinput = requests.get(url)
+    # fail early on an HTTP error rather than trying to unzip an error page
+    zipinput.raise_for_status()
+
+    # unzip tsv
+    key_columns = {'unit','sex','age','geo\\time'}
+    chunks = []
+    with gzip.GzipFile(fileobj = BytesIO(zipinput.content), mode = "r") as z:
+        logging.info("parsing zip file")
+
+        for i,chunk in enumerate(pd.read_csv(z, sep=",|\t", engine = "python", chunksize = chunksize * 10**3)):
+            # columns
+            chunk.columns = [c.strip() for c in chunk.columns]
+            data_columns = [c for c in chunk.columns if c not in key_columns]
+
+            # parse data
+            # tryInt must be mapped per value: DataFrame.apply(tryInt) would pass
+            # whole columns, int() on them fails and the column stays unparsed
+            chunk[data_columns] = chunk[data_columns]\
+                .replace({r'\s*:\s*': None, r'[^0-9]*([0-9]+)[^0-9]*': r'\1'}, regex = True)\
+                .apply(lambda column: column.map(tryInt))
+            chunk = chunk\
+                .drop(['unit'], axis = 1)
+
+            # parse age groups
+            chunk['age'] = chunk['age']\
+                .replace({'Y_LT5': 'Y0-4', 'Y_GE90': 'Y90', 'Y_GE85': 'Y85'})\
+                .replace({r'(.*)-(.*)':r'\1_\2', r'Y(.*)':r'\1'}, regex = True)
+            # output
+            if output is not None:
+                # first chunk creates the file with header, the rest is appended
+                if i == 0: chunk.to_csv(output, mode='w', header=True, index=False)
+                else: chunk.to_csv(output, mode='a', header=False, index=False)
+            else:
+                chunks.append(chunk)
+
+            logging.info(f"parsed {chunksize*i*10**3 + min(chunksize*10**3, chunk.shape[0])}/131880 lines")
+
+    # concatenate once; DataFrame.append copied all data per chunk and was
+    # removed in pandas 2.0
+    return pd.concat(chunks) if chunks else None
+
+__all__ = ["populations"]
+
+if __name__ == "__main__":
+    # direct execution: fetch the populations with INFO logging enabled
+    # and print the distinct age groups
+    logging.basicConfig(level = logging.INFO)
+    age_groups = populations().age.unique()
+    print(age_groups)
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 
 setuptools.setup(
   name = 'eurostat_deaths',
-  version = '0.0.3',
+  version = '0.0.4',
   author = 'Martin Beneš',
   author_email = 'martinbenes1996@gmail.com',
   description = 'Web Scraper for Eurostat data.',
@@ -21,7 +21,7 @@
   packages=setuptools.find_packages(),
   license='MIT',
   url = 'https://github.com/martinbenes1996/eurostat_deaths',
-  download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.3.tar.gz',
+  download_url = 'https://github.com/martinbenes1996/eurostat_deaths/archive/0.0.4.tar.gz',
   keywords = ['eurostat', 'deaths', 'web', 'html', 'webscraping'],
   install_requires = reqs,
   package_dir={'': '.'},