In [None]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import getsize, splitext

# Create Metadata Table

## Reference

[User guide](https://www.statcan.gc.ca/en/developers/csv/user-guide): What the fields in the returned CSVs mean.

[Web Data Service](https://www.statcan.gc.ca/en/developers/wds): How to interact with the Statistics Canada (StatCan) API.

## Download

In [None]:
# Fetch the full cube list from the StatCan Web Data Service (getAllCubesList
# endpoint) and parse the JSON response into a DataFrame. Requires network access.
data_table = pd.read_json("https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesList")

In [None]:
# Display the downloaded table for a quick visual check.
data_table

## Merge Subject Codes

In [None]:
# Load the subject-code lookup table from the local CSV and display it.
# A later cell joins on its 'Subject Code' column.
lookup_subject_codes = pd.read_csv("data/lookup/lookup-subject-codes.csv")
lookup_subject_codes

In [None]:
# Give every subject code its own row (explode the list-valued column), then
# cast the codes to int64 so they can be joined against the lookup table.
data_table_explosion = (
    data_table
    .explode('subjectCode')
    .astype({'subjectCode': 'int64'})
)
data_table_explosion

In [None]:
# Inspect the column dtypes after the explode/cast (subjectCode was cast to int64).
data_table_explosion.dtypes

In [None]:
# Number of distinct products present in the exploded table.
data_table_explosion['productId'].nunique()

In [None]:
# Attach the lookup columns to each exploded row. A left join keeps every
# row even when its subject code has no match in the lookup table.
data_table_merged = pd.merge(
    data_table_explosion,
    lookup_subject_codes,
    how='left',
    left_on='subjectCode',
    right_on='Subject Code',
)
data_table_merged

In [None]:
def create_subject(x):
    """Collapse a group of subject rows into a single list of subject records.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to combine (used below with the rows of one ``productId`` group).
        Must provide the columns ``subjectCode`` and ``English``.

    Returns
    -------
    pandas.Series
        One-element Series labelled ``'subject'`` whose value is a list of
        ``{'code': ..., 'name': ...}`` dicts, one per input row, in row order.
    """
    # Materialise the names once instead of rebuilding the list on every
    # iteration; codes and names are paired by position.
    names = x['English'].tolist()
    subject_object = [
        {'code': code, 'name': name}
        for code, name in zip(x['subjectCode'], names)
    ]
    return pd.Series({'subject': subject_object}, index=['subject'])


    
# Build one row per product: group by productId, collapse each group into its
# subject list, then turn the group key back into an ordinary column.
data_table_group = (
    data_table_merged
    .groupby('productId')
    .apply(create_subject)
    .reset_index()
)
data_table_group

In [None]:
# Join the per-product subject lists back onto the merged rows, drop the
# columns that are no longer needed, and keep the first row for each productId.
data_table_mergeback = (
    data_table_group
    .merge(data_table_merged, how='inner', on='productId')
    .drop(columns=['Subject Code', 'English', 'French', 'corrections'])
    .drop_duplicates(subset='productId')
)
data_table_mergeback

## Merge Available Files

In [None]:
# Scan the output directory, recording the size of every entry and keeping a
# separate list of the entries that are smaller than the size limit.
output_path = 'data/output'
size_limit_bytes = 5_000_000  # entries strictly below this count as "under 5 MB"
files = listdir(output_path)
sizes = []
files_under_5mb = []
files_under_5mb_size = []
for file in files:
    # Stat each entry once and reuse the value, rather than calling getsize
    # up to three times for the same path.
    size = getsize(f"{output_path}/{file}")
    sizes.append(size)
    if size < size_limit_bytes:
        files_under_5mb.append(file)
        files_under_5mb_size.append(size)

In [None]:
# Collect the integer product IDs encoded in the names of the small files,
# skipping any name that contains "filters".
only_datasets = []
for file in files_under_5mb:
    if "filters" in file:
        continue
    stem = splitext(file)[0]
    # Ignore stray directory entries (hidden files, checkpoint folders, ...)
    # whose stem is not a plain decimal number; int() would raise ValueError.
    if stem.isdecimal():
        only_datasets.append(int(stem))
print(len(only_datasets))

In [None]:
# One-column frame holding the product IDs that have a small local file.
dataset_frame = pd.DataFrame(only_datasets, columns=['productId'])

# Left join with an indicator column: `_merge` is 'both' for products that
# matched a local file and 'left_only' otherwise.
merge_available = data_table_mergeback.merge(
    dataset_frame,
    on='productId',
    how='left',
    indicator=True,
)
merge_available['available'] = (merge_available['_merge'] == 'both')
merge_available

## Output Final Metadata Table

In [None]:
# Write the metadata table to disk as a JSON array with one object per row.
merge_available.to_json("data/metadata/metadata.json", orient='records')

# Create Dimensions List

In [None]:
# Flatten the per-product dimension lists into one list of English dimension
# names, then de-duplicate and report how many distinct names there are.
all_dimensions_list = [
    dimension['dimensionNameEn']
    for dimension_list in merge_available['dimensions']
    for dimension in dimension_list
]

unique_set = set(all_dimensions_list)

print(len(unique_set))

In [None]:
# Sort the names before building the frame: a set of strings has no stable
# iteration order between interpreter runs, so the rows would otherwise come
# out in a different order each time. key=str keeps the sort from failing if
# a name is missing (None).
dimensions = pd.DataFrame(sorted(unique_set, key=str), columns=['name'])

In [None]:
# Write the dimension names to disk as a JSON array with one object per row.
dimensions.to_json("data/metadata/dimensions.json", orient='records')