In [1]:
import pandas as pd
import re
import numpy as np

In [27]:
from enum import Enum, auto
class Region(Enum):
    """Region codes, numbered by ``auto()`` in declaration order (starting at 1).

    Besides individual two-letter codes, the enumeration also holds a handful
    of codes that ``get_country_regions`` treats as "not a country".
    """

    AL = auto()
    AM = auto()
    AT = auto()
    AZ = auto()
    BE = auto()
    BG = auto()
    BY = auto()
    CH = auto()
    CY = auto()
    CZ = auto()
    DE = auto()
    DE_TOT = auto()
    DK = auto()
    EA18 = auto()
    EA19 = auto()
    EE = auto()
    EEA30_2007 = auto()
    EEA31 = auto()
    EFTA = auto()
    EL = auto()
    ES = auto()
    EU27_2007 = auto()
    EU27_2020 = auto()
    EU28 = auto()
    FI = auto()
    FR = auto()
    FX = auto()
    GE = auto()
    HR = auto()
    HU = auto()
    IE = auto()
    IS = auto()
    IT = auto()
    LI = auto()
    LT = auto()
    LU = auto()
    LV = auto()
    MD = auto()
    ME = auto()
    MK = auto()
    MT = auto()
    NL = auto()
    NO = auto()
    PL = auto()
    PT = auto()
    RO = auto()
    RS = auto()
    RU = auto()
    SE = auto()
    SI = auto()
    SK = auto()
    SM = auto()
    TR = auto()
    UA = auto()
    UK = auto()
    XK = auto()

    @classmethod
    def get_country_regions(cls):
        """Return the names of all members that are not in the non-country set.

        Returns:
            list[str]: member names, in declaration order.
        """
        # Codes that are excluded from the result.
        excluded_names = {
            'DE_TOT',
            'EA18',
            'EA19',
            'EEA30_2007',
            'EEA31',
            'EFTA',
            'EU27_2007',
            'EU27_2020',
            'EU28',
        }

        country_names = []
        for region in cls:
            if region.name in excluded_names:
                continue
            country_names.append(region.name)
        return country_names


In [30]:
import json


# Persist the country-code list as a JSON fixture for the tests.
country_codes = Region.get_country_regions()
with open('./tests/fixtures/countries_list.json', 'w') as fixture_file:
    fixture_file.write(json.dumps(country_codes))



In [2]:
from pathlib import Path
# from life_expectancy_refactored.strategy_class import Strategy

from abc import ABC, abstractmethod

class Strategy(ABC):
    """Abstract base for the processing steps defined in this notebook.

    Every step exposes the same ``run`` signature; each concrete subclass
    reads only the keyword arguments it needs.
    """

    @abstractmethod
    def run(self, input_path=None, input_dataframe=None, input_region=None):
        """Execute the step. Concrete subclasses must override this method."""


class StrategyLoad(Strategy):
    """Load a data file into a DataFrame, picking the reader from the file extension.

    Supported extensions are ``.tsv``, ``.csv`` and ``.json``; the match is
    case-insensitive, so ``data.TSV`` is read the same way as ``data.tsv``.
    """

    def _load_json(self, input_path):
        """Read ``input_path`` with ``pd.read_json``."""
        return pd.read_json(input_path)

    def _load_csv(self, input_path):
        """Read ``input_path`` as CSV, taking column names from the first row."""
        return pd.read_csv(input_path, header=0)

    def _load_tsv(self, input_path):
        """Read ``input_path`` splitting fields on either a comma or a tab."""
        # The separator is a regular expression, which pandas only supports
        # with the Python parsing engine.
        return pd.read_csv(input_path, sep=r'\,|\t', header=0, engine='python')

    def run(self, input_path=None, input_dataframe=None, input_region=None):
        """Load the file at ``input_path``.

        Args:
            input_path: path of the file to read (str or path-like).
            input_dataframe: unused by this step.
            input_region: unused by this step.

        Returns:
            The DataFrame produced by the reader matching the file extension.

        Raises:
            KeyError: if the file extension is not one of the supported ones.
        """
        loaders = {
            '.tsv': self._load_tsv,
            '.csv': self._load_csv,
            '.json': self._load_json,
        }

        # Normalise the case so '.TSV' and '.tsv' select the same reader.
        suffix = Path(input_path).suffix.lower()

        loader = loaders.get(suffix)
        if loader is None:
            # Still a KeyError, as before, but now it says what went wrong.
            raise KeyError(
                f"unsupported file extension {suffix!r}; expected one of {sorted(loaders)}"
            )

        return loader(input_path)
    

class StrategyClean(Strategy):
    """Reshape the wide table to long format and turn the values into floats."""

    @staticmethod
    def _numeric_text(cell):
        """Reduce one raw cell to its numeric text, or ``None`` if it has no number.

        Non-string cells (already numeric, or missing) are returned unchanged so
        they are neither rejected nor silently blanked by string operations.
        """
        if not isinstance(cell, str):
            return cell
        if ':' in cell:
            # A colon marks the cell as having no value.
            return None
        # Keep digits and the decimal point only; this drops whitespace and
        # any trailing letters attached to the number.
        digits = re.sub(r'[^0-9.]', '', cell)
        # Nothing numeric left: treat as missing instead of failing on float('').
        return digits or None

    def run(self, input_path=None, input_dataframe=None, input_region=None):
        """Clean ``input_dataframe`` and return it in long format.

        Args:
            input_path: unused by this step.
            input_dataframe: wide DataFrame holding the id columns 'unit',
                'sex', 'age' and the geo/time column, plus one column per year.
            input_region: unused by this step.

        Returns:
            DataFrame with columns unit, sex, age, region, year (int) and
            value (float); rows without a value are dropped. The index is
            not reset.
        """
        df = input_dataframe.melt(id_vars=['unit', 'sex', 'age', 'geo\\time'])
        df = df.rename(columns={'geo\\time': 'region', 'variable': 'year'})

        # Strip everything that is not part of the number, then parse.
        df['value'] = pd.to_numeric(df['value'].map(self._numeric_text))

        df = df.astype({'year': int, 'value': float})

        return df.dropna()
    
class StrategyFilter(Strategy):
    """Keep only the rows that belong to one region."""

    def run(self, input_path=None, input_dataframe=None, input_region=None):
        """Return the rows of ``input_dataframe`` whose 'region' equals ``input_region.name``.

        The returned frame has a fresh 0..n-1 index.
        """
        region_code = input_region.name
        belongs_to_region = input_dataframe['region'].eq(region_code)
        return input_dataframe[belongs_to_region].reset_index(drop=True)

In [5]:
# Create both steps up front, then run them in order: load -> clean.
load = StrategyLoad()
clean = StrategyClean()

# Alternative input:
# df = load.run(input_path='./data/eurostat_life_expect.json')
df = load.run(input_path='./data/eu_life_expectancy_raw.tsv')
df_clean = clean.run(input_dataframe=df)

# Optional filtering step, currently disabled:
# filter = StrategyFilter()
# df_filter = filter.run(input_dataframe=df_clean, input_region=Region.PT)

# Last expression of the cell: show the cleaned frame with a fresh 0..n-1 index.
# The result is not assigned, so df_clean itself keeps its original index.
df_clean.reset_index(drop=True)

Unnamed: 0,unit,sex,age,region,year,value
0,YR,F,Y65,AT,2021,21.2
1,YR,F,Y65,BE,2021,22.2
2,YR,F,Y65,BG,2021,15.6
3,YR,F,Y65,CH,2021,23.1
4,YR,F,Y65,CY,2021,21.2
...,...,...,...,...,...,...
474377,YR,T,Y_LT1,EE,1960,69.4
474378,YR,T,Y_LT1,HU,1960,68.1
474379,YR,T,Y_LT1,NO,1960,73.8
474380,YR,T,Y_LT1,PT,1960,64.0


In [7]:
# Inline version of the load + clean pipeline.
# The separator is written as a raw string: in a normal string literal '\,' is
# an invalid escape sequence, which recent Python versions warn about.
data = (
    pd.read_csv('./data/eu_life_expectancy_raw.tsv', sep=r'\,|\t', header=0, engine='python')
    .melt(id_vars=['unit', 'sex', 'age', 'geo\\time'])
    .rename(columns={'geo\\time': 'region', 'variable': 'year'})
)

# Cells containing ':' carry no value; everything else is reduced to digits and '.'.
data['value'] = (
    data['value']
    .apply(lambda x: None if ':' in x else x)
    .str.replace(r'[^0-9.]', '', regex=True)
)

data = data.astype({'year': int, 'value': float}).dropna()
# data_only_pt = data[data['region'] == 'PT']
# data_only_pt
# data_only_pt.to_csv('./data/pt_life_expectancy.csv', index=False)

['AL' 'AM' 'AT' 'AZ' 'BE' 'BG' 'BY' 'CH' 'CY' 'CZ' 'DE' 'DE_TOT' 'DK'
 'EA18' 'EA19' 'EE' 'EEA30_2007' 'EEA31' 'EFTA' 'EL' 'ES' 'EU27_2007'
 'EU27_2020' 'EU28' 'FI' 'FR' 'FX' 'GE' 'HR' 'HU' 'IE' 'IS' 'IT' 'LI' 'LT'
 'LU' 'LV' 'MD' 'ME' 'MK' 'MT' 'NL' 'NO' 'PL' 'PT' 'RO' 'RS' 'RU' 'SE'
 'SI' 'SK' 'SM' 'TR' 'UA' 'UK' 'XK']


In [19]:
# An enum member's `.name` is its identifier as a plain string.
Region.PT.name

'PT'

In [9]:
# Display the member itself; its repr includes the value assigned by auto().
Region.RU

<Region.RU: 48>

In [6]:
# Build the 'PT' subset here so this cell also runs on a fresh kernel: its only
# other definition in the notebook is commented out.
data_only_pt = data[data['region'] == 'PT']
data_only_pt.dtypes

unit       object
sex        object
age        object
region     object
year        int64
value     float64
dtype: object