In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so that
# the project's packages (imported in later cells) can be resolved. The
# membership guard keeps re-runs of this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [26]:
import pandas as pd

In [35]:
from app.etl.pipeline import Pipeline
from app.models.models import CovidVaccinationByCategory

In [44]:
from sqlalchemy.orm import declarative_base
from sqlalchemy.sql.schema import Column
from sqlalchemy.sql.sqltypes import Date
from sqlalchemy.sql.sqltypes import Integer
from sqlalchemy.sql.sqltypes import String

# Declarative base class that the ORM model defined in the next cell inherits from.
Base = declarative_base()

In [45]:
class CovidVaccinationByCategory(Base):
    """ORM model mapped to the ``covid_vaccinations_by_category`` table.

    ``id`` is the integer primary key; every other column is declared
    ``nullable=False``. The attribute names match the lower-case column names
    produced by the rename mapping used elsewhere in this notebook.

    NOTE(review): a class with the same name is imported from
    ``app.models.models`` earlier in the notebook; this definition rebinds
    that name for the rest of the session.
    """

    __tablename__ = "covid_vaccinations_by_category"
    id = Column(Integer, primary_key=True, nullable=False)
    date = Column(Date, nullable=False)
    region = Column(String, nullable=False)
    agegroup = Column(String, nullable=False)
    sex = Column(String, nullable=False)
    brand = Column(String, nullable=False)
    dose = Column(String, nullable=False)
    count = Column(Integer, nullable=False)

    def __repr__(self):
        """Return a single-line debug representation of the row.

        The class name is taken from the instance, so the text always names
        the real class, and the result carries no surrounding newlines or
        indentation.
        """
        return "<%s(date='%s', region='%s', agegroup='%s')>" % (
            type(self).__name__,
            self.date,
            self.region,
            self.agegroup,
        )


In [46]:
class Transformer():
    """Configurable clean-up step for a ``pandas.DataFrame``.

    Args:
        column_renamer: Mapping of existing column name to new column name.
            Renaming is skipped when this is ``None`` or empty.
        na_remover: When true, rows containing any missing value are dropped.
        drop_columns: Column labels to remove. Dropping is skipped when this
            is ``None`` or empty.
    """

    def __init__(
        self,
        column_renamer: dict = None,
        na_remover: bool = True,
        drop_columns: list = None
    ):
        self.column_renamer = column_renamer
        self.na_remover = na_remover
        self.drop_columns = drop_columns

    def transform(self, data_frame: pd.DataFrame) -> pd.DataFrame:
        """Return a transformed copy of ``data_frame``.

        Steps run in a fixed order: drop columns, rename columns, drop rows
        with missing values. The input frame is never modified, so callers do
        not need to pass a defensive copy and must use the returned frame.

        Args:
            data_frame: The frame to transform.

        Returns:
            A new ``DataFrame`` with the configured steps applied.
        """
        result = data_frame
        if self.drop_columns:
            # Columns are dropped by their original (pre-rename) labels.
            result = result.drop(columns=self.drop_columns)
        if self.column_renamer:
            result = result.rename(columns=self.column_renamer)
        if self.na_remover:
            result = result.dropna()
        if result is data_frame:
            # No step ran; still hand back a distinct object so the
            # "input is never modified" contract holds for later edits.
            result = data_frame.copy()

        return result


In [47]:
# Configure the transformer to rename each upper-case source header to its
# lower-case form; the other Transformer options keep their defaults.
transformer = Transformer(
    column_renamer={
        header: header.lower()
        for header in ("DATE", "REGION", "AGEGROUP", "SEX", "BRAND", "DOSE", "COUNT")
    }
)

In [48]:
# Assemble the ETL pipeline from the ORM model class, the CSV source URL and
# the transformer configured above.
# NOTE(review): Pipeline comes from app.etl.pipeline and is not visible here;
# presumably the first positional argument is the target model — confirm.
pipeline = Pipeline(
    CovidVaccinationByCategory,
    path="https://epistat.sciensano.be/Data/COVID19BE_VACC.csv",
    transformer=transformer,
)

In [49]:
# Run the extract step, then take a deep copy to work on so that the
# exploration and transformation below cannot alter df_extracted.
df_extracted = pipeline.extract()
df = df_extracted.copy(deep=True)

In [52]:
df_extracted.head()

Unnamed: 0,DATE,REGION,AGEGROUP,SEX,BRAND,DOSE,COUNT
0,2020-12-28,Brussels,25-34,F,Pfizer-BioNTech,A,1
1,2020-12-28,Brussels,45-54,F,Pfizer-BioNTech,A,2
2,2020-12-28,Brussels,55-64,F,Pfizer-BioNTech,A,3
3,2020-12-28,Brussels,55-64,M,Pfizer-BioNTech,A,1
4,2020-12-28,Brussels,65-74,F,Pfizer-BioNTech,A,2


In [41]:
df_extracted.describe()

Unnamed: 0,COUNT
count,100628.0
mean,171.647384
std,933.715033
min,1.0
25%,2.0
50%,9.0
75%,57.0
max,37524.0


In [55]:
df_extracted.dtypes

DATE        object
REGION      object
AGEGROUP    object
SEX         object
BRAND       object
DOSE        object
COUNT        int64
dtype: object

In [56]:
df_extracted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100628 entries, 0 to 100627
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   DATE      100628 non-null  object
 1   REGION    82763 non-null   object
 2   AGEGROUP  100628 non-null  object
 3   SEX       100045 non-null  object
 4   BRAND     100628 non-null  object
 5   DOSE      100628 non-null  object
 6   COUNT     100628 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 5.4+ MB


In [59]:
# DATE is loaded as object (string) dtype; parse it into a datetime64 column
# alongside the original so both representations can be compared.
df["DATE_CONVERTED"] = pd.to_datetime(df['DATE'])

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100628 entries, 0 to 100627
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   DATE            100628 non-null  object        
 1   REGION          82763 non-null   object        
 2   AGEGROUP        100628 non-null  object        
 3   SEX             100045 non-null  object        
 4   BRAND           100628 non-null  object        
 5   DOSE            100628 non-null  object        
 6   COUNT           100628 non-null  int64         
 7   DATE_CONVERTED  100628 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 6.1+ MB


In [62]:
df.columns

Index(['DATE', 'REGION', 'AGEGROUP', 'SEX', 'BRAND', 'DOSE', 'COUNT',
       'DATE_CONVERTED'],
      dtype='object')

In [66]:
df['REGION'].unique()

array(['Brussels', 'Flanders', nan, 'Wallonia', 'Ostbelgien'],
      dtype=object)

In [65]:
df['AGEGROUP'].unique()

array(['25-34', '45-54', '55-64', '65-74', '75-84', '85+', '35-44',
       '18-24', '16-17', '00-11', '12-15'], dtype=object)

In [67]:
df['SEX'].unique()

array(['F', 'M', nan], dtype=object)

In [68]:
df['BRAND'].unique()

array(['Pfizer-BioNTech', 'Moderna', 'Johnson&Johnson', 'Other',
       'AstraZeneca-Oxford'], dtype=object)

In [69]:
df['DOSE'].unique()

array(['A', 'C', 'B', 'E'], dtype=object)

In [17]:
# Run the pipeline's transform step on the working copy.
# NOTE(review): this cell's execution count (17) is lower than that of the
# DATE_CONVERTED cell (59), which is presumably why the recorded output has no
# DATE_CONVERTED column. On a fresh top-to-bottom run df would still carry
# that extra column here — confirm which behaviour is intended.
df_transformed = pipeline.transform(df)

date        object
region      object
agegroup    object
sex         object
brand       object
dose        object
count        int64
dtype: object


In [18]:
df_transformed.head()

Unnamed: 0,date,region,agegroup,sex,brand,dose,count
0,2020-12-28,Brussels,25-34,F,Pfizer-BioNTech,A,1
1,2020-12-28,Brussels,45-54,F,Pfizer-BioNTech,A,2
2,2020-12-28,Brussels,55-64,F,Pfizer-BioNTech,A,3
3,2020-12-28,Brussels,55-64,M,Pfizer-BioNTech,A,1
4,2020-12-28,Brussels,65-74,F,Pfizer-BioNTech,A,2


In [19]:
df_extracted.head()

Unnamed: 0,DATE,REGION,AGEGROUP,SEX,BRAND,DOSE,COUNT
0,2020-12-28,Brussels,25-34,F,Pfizer-BioNTech,A,1
1,2020-12-28,Brussels,45-54,F,Pfizer-BioNTech,A,2
2,2020-12-28,Brussels,55-64,F,Pfizer-BioNTech,A,3
3,2020-12-28,Brussels,55-64,M,Pfizer-BioNTech,A,1
4,2020-12-28,Brussels,65-74,F,Pfizer-BioNTech,A,2


In [22]:
# Bootstrap/scratch cell: repeats the sys.path setup and the imports, then
# prints BASE_DIR and Pipeline to check that the project packages resolve.
import os
import sys
# Add the parent of the current working directory to sys.path, once.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
import openpyxl
import sqlalchemy

from sqlalchemy.orm import declarative_base
from sqlalchemy.sql.schema import Column
from sqlalchemy.sql.sqltypes import String, Integer, Date

from settings import BASE_DIR
print(BASE_DIR)

from app.etl.pipeline import Pipeline
print(Pipeline)

# NOTE(review): this binds `pipeline` to the Pipeline class itself, not to an
# instance, and rebinds any `pipeline` object created in an earlier cell —
# confirm this is intentional.
pipeline = Pipeline

# Earlier attempt at deriving BASE_DIR by hand, left commented out.
# import os
# print(os.getcwd())
# from pathlib import Path

# BASE_DIR = Path(os.getcwd()).resolve(strict=True).parent
# from BASE_DIR import settings
# print(BASE_DIR)

# Bare expression: displays the os.path module as the cell's output.
os.path
<module 'posixpath' from '/Users/wimsuenens/.pyenv/versions/3.8.12/lib/python3.8/posixpath.py'>
os.path
/Users/wimsuenens/Projects/HoGent/DataEngineeringProjectII
['/Users/wimsuenens/Projects/HoGent/DataEngineeringProjectII/notebooks', '/Users/wimsuenens/.pyenv/versions/3.8.12/lib/python38.zip', '/Users/wimsuenens/.pyenv/versions/3.8.12/lib/python3.8', '/Users/wimsuenens/.pyenv/versions/3.8.12/lib/python3.8/lib-dynload', '', '/Users/wimsuenens/.local/share/virtualenvs/DataEngineeringProjectII-bdwPgCe6/lib/python3.8/site-packages', '/Users/wimsuenens/.local/share/virtualenvs/DataEngineeringProjectII-bdwPgCe6/lib/python3.8/site-packages/IPython/extensions', '/Users/wimsuenens/.ipython', '/Users/wimsuenens/Projects/HoGent/DataEngineeringProjectII']
/Users/wimsuenens/Projects/HoGent/DataEngineeringProjectII
<class 'app.etl.pipeline.Pipeline'>


ETL - Pipeline | Extract - Transform - Load

In [9]:
# Declarative base class that the ORM model defined in the next cell inherits from.
# NOTE(review): `Base` is also created earlier in this notebook; this call
# rebinds the name to a new base class.
Base = declarative_base()

In [10]:
class CovidVacinationByCategory(Base):
    """ORM model mapped to the ``covid_vaccinations_by_category`` table.

    ``id`` is the integer primary key; the remaining columns are declared
    without explicit nullability.

    NOTE(review): the class name is spelled "Vacination"; it is kept as is
    because another cell in this notebook instantiates the class by this name.
    """

    __tablename__ = 'covid_vaccinations_by_category'
    id = Column(Integer, primary_key=True, nullable=False)
    date = Column(Date)
    region = Column(String)
    agegroup = Column(String)
    sex = Column(String)
    brand = Column(String)
    dose = Column(String)
    count = Column(Integer)

    def __repr__(self):
        """Return a single-line debug representation of the row.

        The class name is taken from the instance, so the text always names
        the real class, and the result carries no surrounding newlines or
        indentation.
        """
        return "<%s(date='%s', region='%s', agegroup='%s')>" % (
            type(self).__name__,
            self.date,
            self.region,
            self.agegroup,
        )

In [2]:
# Read the vaccination CSV directly from the epistat.sciensano.be URL (network access required).
data_frame = pd.read_csv("https://epistat.sciensano.be/Data/COVID19BE_VACC.csv")

In [16]:
# Map every upper-case source header to its lower-case form, matching the
# attribute names of the ORM model defined in this notebook.
column_rename_dict = {
    header: header.lower()
    for header in ("DATE", "REGION", "AGEGROUP", "SEX", "BRAND", "DOSE", "COUNT")
}
# rename() returns a new frame; headers absent from the frame are ignored,
# so re-running this cell is harmless.
data_frame = data_frame.rename(column_rename_dict, axis="columns")
data_frame

Unnamed: 0,date,region,agegroup,sex,brand,dose,count
0,2020-12-28,Brussels,25-34,F,Pfizer-BioNTech,A,1
1,2020-12-28,Brussels,45-54,F,Pfizer-BioNTech,A,2
2,2020-12-28,Brussels,55-64,F,Pfizer-BioNTech,A,3
3,2020-12-28,Brussels,55-64,M,Pfizer-BioNTech,A,1
4,2020-12-28,Brussels,65-74,F,Pfizer-BioNTech,A,2
...,...,...,...,...,...,...,...
94202,2021-10-07,Wallonia,75-84,M,Pfizer-BioNTech,E,67
94203,2021-10-07,Wallonia,85+,F,Johnson&Johnson,C,1
94204,2021-10-07,Wallonia,85+,F,Pfizer-BioNTech,A,1
94205,2021-10-07,Wallonia,85+,F,Pfizer-BioNTech,E,234


In [14]:
# Build one ORM object per DataFrame row by unpacking each row's
# {column: value} dict as keyword arguments to the model constructor.
# NOTE(review): the target name `list` shadows the built-in `list` for the
# rest of the kernel session; consider renaming it once every use of this
# variable is known.
list = [CovidVacinationByCategory(**kwargs) for kwargs in data_frame.to_dict(orient='records')]