# Life Expectancy

In [1]:
import pandas as pd
import requests, zipfile, io
import re
import os

## Technical Setup

In [2]:
# IPython magics: load the autoreload extension and switch it to mode 2
# (per the IPython docs, modules are re-imported before each cell runs).
%load_ext autoreload
%autoreload 2

# Raise pandas' display limits to 200 so long cell values and wide tables
# are shown with less truncation.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 200)

# Paths are relative to the working directory; cache_dir is <data_dir>/cache.
data_dir = '../data'
cache_dir = os.path.join(data_dir, 'cache')
# Put the parent directory on the import path — presumably where the local
# canadadata package lives (TODO confirm).
import sys
sys.path.append('..')

# Project helpers used by the cells below.
from canadadata.statscan import to_wide_format, read_statscan_csv
from canadadata.io import unzip_data

ModuleNotFoundError: No module named 'canadadata'

## Download Data

In [None]:
# Hand the table 39100007 zip URL and the cache directory to the project helper
# (presumably downloads and extracts the archive into cache_dir — confirm).
unzip_data('https://www150.statcan.gc.ca/n1/tbl/csv/39100007-eng.zip', cache_dir)

## Life Expectancy Data

In [None]:
# Read 39100007.csv from the cache directory with the project reader.
statscan_data = read_statscan_csv(os.path.join(cache_dir, '39100007.csv'))
# Reshape with to_wide_format, leaving its pivot column at the helper's default.
lifeexp_data = to_wide_format(statscan_data)

In [None]:
# Inspect the memory usage of the wide-format result.
lifeexp_data.memory_usage()

In [None]:
# Check which dtype each column of the wide-format result ended up with.
lifeexp_data.dtypes

In [None]:
# Count how often each distinct SYMBOL value occurs.
lifeexp_data.SYMBOL.value_counts()

## Save to Parquet (via PyArrow)

In [None]:
# pyarrow is imported inside this cell rather than with the imports at the top.
import pyarrow as pa
import pyarrow.parquet as pq

# NOTE(review): `data` is not assigned in the visible cells above; it presumably
# refers to a long-format frame (e.g. statscan_data) from an earlier session —
# confirm before running top to bottom.
# Convert the frame to an Arrow table and write it out as a Parquet file.
data_table = pa.Table.from_pandas(data)
pq.write_table(data_table, '../data/LifeExpectancy.parquet')

In [None]:
# Row count; as a non-final expression in the cell, its value is not displayed.
len(data)
# Memory usage of the Element column after casting it to a categorical dtype.
data.Element.astype('category').memory_usage()

In [None]:
def get_elements(data):
    """Return the distinct values of ``data.Element`` as a plain list.

    Duplicates are removed with ``drop_duplicates``, so each value appears
    once, at the position of its first occurrence.
    """
    distinct = data.Element.drop_duplicates()
    return distinct.tolist()

def get_element_renames(data):
    """Build a rename template for the elements found in *data*.

    Returns a dict whose keys are the values from ``get_elements(data)`` and
    whose values are all the empty string, ready to be filled in by hand.
    """
    return dict.fromkeys(get_elements(data), '')

# Display the empty rename template for the current `data` frame.
get_element_renames(data)

In [None]:
# Keep only the rows whose REF_DATE is the '2013/2015' period.
data15 = data[data.REF_DATE =='2013/2015']

In [None]:
# Column names grouped by role (roles as suggested by the variable names):
# one partition column, the dimension columns, and the value column(s).
partition = "REF_DATE"
dimensions = [
    "GEO",
    "Age group",
    "Sex",
    "Element",
]
value = ["VALUE"]

In [None]:
# Reference list of column names, kept in this fixed order.
REF_COLS = [
    "REF_DATE",
    "GEO",
    "DGUID",
    "Age group",
    "Sex",
    "Element",
    "UOM",
    "UOM_ID",
    "SCALAR_FACTOR",
    "SCALAR_ID",
    "VECTOR",
    "COORDINATE",
    "VALUE",
    "STATUS",
    "SYMBOL",
    "TERMINATED",
    "DECIMALS",
]

In [None]:
# Open Canada catalogue page URL for the dataset.
lf_url = 'https://open.canada.ca/data/en/dataset/74ded0d6-c629-4a5f-bdd2-9c088c9b0d54'
# CSV zip URL for table 13100114. This is a bare literal, not an assignment:
# it is only echoed as the cell's output and is not stored anywhere.
'https://www150.statcan.gc.ca/n1/tbl/csv/13100114-eng.zip'

In [None]:
class StatscanDataset:
    """A Statistics Canada table identified by its zip URL.

    Bundles the archive URL with the name of the column that
    ``to_wide_format`` should pivot on, so the table can be loaded with a
    single ``get_data()`` call.
    """

    def __init__(self, url: str, pivot_column: str) -> None:
        """Remember the archive URL and the column to pivot on."""
        self.url = url
        self.pivot_column = pivot_column

    def get_data(self, cache_dir=cache_dir, wide=True):
        """Load the table, optionally reshaped to wide format.

        Args:
            cache_dir: Directory passed to ``unzip_data``; defaults to the
                notebook-level ``cache_dir`` as it was when the class was
                defined.
            wide: When true, pass the frame through ``to_wide_format`` using
                ``self.pivot_column``; otherwise return it as read.

        Returns:
            Whatever ``read_statscan_csv`` produced for the first entry
            returned by ``unzip_data``, reshaped if ``wide`` is true.
        """
        extracted = unzip_data(self.url, cache_dir)
        # Only the first entry is read — assumes it is the data CSV; verify
        # against unzip_data's ordering.
        frame = read_statscan_csv(extracted[0])
        if not wide:
            return frame
        return to_wide_format(frame, pivot_column=self.pivot_column)

## Life Expectancy Data

In [None]:
# Table 13100114, pivoted on the 'Element' column.
lifeexp_dataset = StatscanDataset('https://www150.statcan.gc.ca/n1/tbl/csv/13100114-eng.zip', 'Element')
# Rebinds lifeexp_data, replacing the frame loaded from 39100007.csv earlier.
lifeexp_data = lifeexp_dataset.get_data()

In [None]:
# Display the loaded result.
lifeexp_data

## Retail Prices

In [None]:
# Table 18100251, pivoted on its NAICS classification column.
retail_prices_dataset = StatscanDataset('https://www150.statcan.gc.ca/n1/tbl/csv/18100251-eng.zip', 
                                        pivot_column='North American Industry Classification System (NAICS)')
# Load with the defaults (notebook cache_dir, wide format) and display the result.
retail_prices = retail_prices_dataset.get_data()
retail_prices