In [1]:
from datetime import date, datetime, timedelta
from pytz import timezone
import pytz
utc = pytz.utc
# utc.zone
brussels = timezone('Europe/Brussels')
# brussels.zone
today = datetime.now(brussels).date()
# date.today()


In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


In [3]:
from datetime import datetime, date
import pandas as pd
import numpy as np

In [4]:
from app.utils import get_db_engine

In [5]:
from app.etl.pipeline import Pipeline
from app.etl.pipeline import Transformer
from app.models import models
from app.models.metadata import ETL_Metadata

In [15]:
pl = {
  "source": "https://statbel.fgov.be/sites/default/files/files/opendata/deathday/DEMO_DEATH_OPEN.zip",
  "model": "NumberOfDeathsByDistrictNISCode",
  "metadata_handler": {
    "frequency": "daily",
    "date_column": "date"
  },
  "tranforms": [
    {
      "type": "drop_columns",
      "data": {
        "columns": [
          "CD_PROV",
          "CD_REGIO",
          "NR_YEAR",
          "NR_WEEK"
        ]
      }
    },
    {
      "type": "rename_columns",
      "data": {
        "columns": {
          "CD_ARR": "nis_district",
          "CD_SEX": "sex",
          "CD_AGEGROUP": "agegroup",
          "DT_DATE": "date",
          "MS_NUM_DEATH": "number_of_deaths"
        }
      }
    },
    {
      "type": "update_value",
      "data": {
        "column": "date",
        "update": {
          "type": "date",
          "format": "%d/%m/%Y"
        }
      }
    },
    {
      "type": "group_by",
      "data": {
        "columns": [
          "date",
          "nis_district",
          "sex",
          "agegroup"
        ],
        "aggregate": {
          "type": "sum"
        }
      }
    },
    {
      "type": "update_value",
      "data": {
        "column": "nis_district",
        "type": "string",
        "update": {
          "type": "string",
          "format": "{0:0>5}"
        }
      }
    }
  ]
}

In [16]:
transformer=Transformer(pl["tranforms"])
pipeline = Pipeline(
    data_class=getattr(models, pl["model"]),
    path=pl["source"],
    transformer=transformer
)

In [17]:
data_frame = pipeline.extract()

In [18]:
data_frame

Unnamed: 0,CD_ARR,CD_PROV,CD_REGIO,CD_SEX,CD_AGEGROUP,DT_DATE,NR_YEAR,NR_WEEK,MS_NUM_DEATH
0,11000,10000,2000,1,45-64,1/1/2009,2009,2009-W01,3
1,11000,10000,2000,1,65-74,1/1/2009,2009,2009-W01,3
2,11000,10000,2000,1,75-84,1/1/2009,2009,2009-W01,6
3,11000,10000,2000,1,85+,1/1/2009,2009,2009-W01,5
4,11000,10000,2000,2,65-74,1/1/2009,2009,2009-W01,1
...,...,...,...,...,...,...,...,...,...
769385,21000,4000,4000,2,25-44,31/10/2021,2021,2021-W43,2
769386,21000,4000,4000,2,45-64,31/10/2021,2021,2021-W43,2
769387,21000,4000,4000,2,65-74,31/10/2021,2021,2021-W43,4
769388,21000,4000,4000,2,75-84,31/10/2021,2021,2021-W43,4


In [9]:
data_frame.describe()

Unnamed: 0,CD_ARR,CD_PROV,CD_REGIO,CD_SEX,NR_YEAR,MS_NUM_DEATH
count,769390.0,769390.0,769390.0,769390.0,769390.0,769390.0
mean,45619.111244,41731.346078,2492.861878,1.476951,2014.951742,1.822042
std,22528.129442,23609.477451,589.889828,0.499469,3.705712,1.389211
min,11000.0,4000.0,2000.0,1.0,2009.0,1.0
25%,25000.0,20002.0,2000.0,1.0,2012.0,1.0
50%,44000.0,40000.0,2000.0,1.0,2015.0,1.0
75%,62000.0,60000.0,3000.0,2.0,2018.0,2.0
max,93000.0,90000.0,4000.0,2.0,2021.0,33.0


In [19]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769390 entries, 0 to 769389
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CD_ARR        769390 non-null  int64 
 1   CD_PROV       769390 non-null  int64 
 2   CD_REGIO      769390 non-null  int64 
 3   CD_SEX        769390 non-null  int64 
 4   CD_AGEGROUP   769390 non-null  object
 5   DT_DATE       769390 non-null  object
 6   NR_YEAR       769390 non-null  int64 
 7   NR_WEEK       769390 non-null  object
 8   MS_NUM_DEATH  769390 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 52.8+ MB


In [11]:
data_frame.isnull().sum()

CD_ARR          0
CD_PROV         0
CD_REGIO        0
CD_SEX          0
CD_AGEGROUP     0
DT_DATE         0
NR_YEAR         0
NR_WEEK         0
MS_NUM_DEATH    0
dtype: int64

In [20]:
df = pipeline.transform(data_frame)

In [22]:
data_frame

Unnamed: 0,nis_district,sex,agegroup,date,number_of_deaths
0,11000,1,45-64,2009-01-01,3
1,11000,1,65-74,2009-01-01,3
2,11000,1,75-84,2009-01-01,6
3,11000,1,85+,2009-01-01,5
4,11000,2,65-74,2009-01-01,1
...,...,...,...,...,...
769385,21000,2,25-44,2021-10-31,2
769386,21000,2,45-64,2021-10-31,2
769387,21000,2,65-74,2021-10-31,4
769388,21000,2,75-84,2021-10-31,4


In [21]:

data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769390 entries, 0 to 769389
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   nis_district      769390 non-null  int64 
 1   sex               769390 non-null  int64 
 2   agegroup          769390 non-null  object
 3   date              769390 non-null  object
 4   number_of_deaths  769390 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 29.4+ MB


In [16]:
list = [
    pipeline.data_class(**kwargs) for kwargs in data_frame.to_dict(orient="records")
]

TypeError: object of type 'int' has no len()

In [None]:
list

In [None]:
data_frame = pipeline.handle_metadata(data_frame)

In [None]:
data_frame

In [None]:
data_frame = data_frame.groupby([
    'year', 'nis', 'sex',
    'nationality_code', 'nationality_text_nl', 'nationality_text_fr',
    'marital_status_code',
    'marital_status_text_nl',
    'marital_status_text_fr',
    'age'
]).sum().reset_index()
data_frame

In [None]:
data_frame.groupby(['year', 'nis', 'sex', 'nationality_code', 'marital_status_code', 'age'])['population'].transform('size')

In [None]:

data_frame[data_frame.groupby(['year', 'nis', 'sex', 'nationality_code', 'marital_status_code', 'age'])['nis'].transform('size') > 1]


In [None]:
data_frame = data_frame.groupby([
    'year', 'week', 'nis_district',
    'sex', 'agegroup', 'date'
]).sum().reset_index()
data_frame

In [None]:
data_frame.dropna(inplace=True)

In [None]:
data_frame.isnull().sum()

In [None]:
test = 2021
type(test)

In [None]:
starting_day_of_current_year = datetime.now().date().replace(month=1, day=1)    
ending_day_of_current_year = datetime.now().date().replace(month=12, day=31)
starting_day_of_current_year

In [None]:
frequency = "daily"
frequency is not "daily"

In [5]:
test = "Hello World"
if 'llo W'.lower() in test.lower():
    print('FOUND')

FOUND


In [4]:
def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    d3 = d2 + time
    return abs((d2 - d1).days)

days_between("2021-1-1", "2021-1-31")

30