In [12]:
import glob
import os
import re

import pandas as pd

In [13]:
root_dir = '../../data'
files = [f for f in glob.glob('*/*.csv', root_dir=root_dir, recursive=True) if not re.match(r'^(interim)', f) ]

In [45]:
def make_key(s: str, sep: str = '-') -> str:
    """Build a lowercase key from *s*.

    The string is lowercased, every run of non-word characters (regex
    ``\\W+``) is replaced by a single *sep*, and leading/trailing *sep*
    characters are removed.

    Args:
        s: Text to convert, e.g. a file name.
        sep: Separator inserted verbatim between word runs. Note that the
            final ``strip`` treats *sep* as a set of characters.

    Returns:
        The normalised key, e.g. ``make_key('My File.csv') == 'my-file-csv'``.
    """
    # A callable replacement inserts sep literally. Passing sep as a plain
    # replacement string would make re.sub interpret backslashes in it as
    # escapes/group references (raising re.error for e.g. sep='\\').
    return re.sub(r'\W+', lambda _match: sep, s.lower()).strip(sep)

class Dimension:
    """Name and distinct values of a single column.

    Attributes:
        name: The name of the series the dimension was built from.
        values: The unique values of the series as a plain Python list.
    """

    def __init__(self, series: pd.Series):
        """Capture the name and unique values of *series*."""
        self.name = series.name
        # tolist() turns the array returned by unique() into a Python list.
        self.values = series.unique().tolist()

    def __repr__(self) -> str:
        return f'Dimension->{self.name}'

    def to_dict(self) -> dict:
        """Return the instance attributes as a new dict.

        The result is detached from the instance: changing the returned dict
        or its ``values`` list does not alter this object.
        """
        # Copy rather than hand out self.__dict__ itself, which would let
        # callers mutate the instance by editing the result.
        result = dict(self.__dict__)
        result['values'] = list(self.values)
        return result

class Fact:
    """Summary of a single value column.

    Attributes:
        name: The name of the series the fact was built from.
        type: The series dtype rendered as a string.
        description: The output of ``Series.describe()`` as a dict.
        na: Number of missing entries in the series, as a plain ``int``.
    """

    def __init__(self, series: pd.Series):
        """Compute the summary attributes from *series*."""
        self.name = series.name
        self.type = str(series.dtype)
        self.description = series.describe().to_dict()
        # isna().sum() yields a NumPy integer; store a built-in int so the
        # attribute behaves like an ordinary Python number downstream.
        self.na = int(series.isna().sum())

    def __repr__(self) -> str:
        return f'Fact->{self.name} ({self.type})'

    def to_dict(self) -> dict:
        """Return the instance attributes as a new dict.

        The result is detached from the instance: changing the returned dict
        or its ``description`` mapping does not alter this object.
        """
        # Copy rather than hand out self.__dict__ itself, which would let
        # callers mutate the instance by editing the result.
        result = dict(self.__dict__)
        result['description'] = dict(self.description)
        return result

class Metadata:
    """Catalogue entry for one dataset file.

    Attributes:
        id: Key derived from the file name via ``make_key``.
        group: Directory part of the dataset path with ``os.sep`` replaced
            by dots.
        facts: One ``Fact`` per column of the pivoted dataset.
        dimensions: One ``Dimension`` per column that is neither a variable,
            a value nor ignored.
    """

    def __init__(self):
        # Start with empty attributes so repr() and to_dict() also work on an
        # instance that has not been loaded (or whose load failed).
        self.id = None
        self.group = None
        self.facts = []
        self.dimensions = []

    @staticmethod
    def _as_list(columns) -> list:
        """Return *columns* as a new list; a lone string becomes a one-item list."""
        if isinstance(columns, str):
            return [columns]
        return list(columns)

    def load(self, dataset_path: str, loader=pd.read_csv, variables=('variable_name',), values=('value',), ignored=(), root_dir='') -> 'Metadata | None':
        """Read a dataset and derive its facts and dimensions.

        Args:
            dataset_path: Path of the dataset relative to *root_dir*. Its base
                name becomes ``id`` and its directory becomes ``group``.
            loader: Callable that takes the joined path and returns a
                DataFrame.
            variables: Column(s) whose entries become the pivoted column names.
            values: Column(s) holding the measurements to pivot.
            ignored: Column(s) excluded from the dimensions.
            root_dir: Directory that *dataset_path* is joined onto for loading.

        Returns:
            ``self`` on success, or ``None`` when the dataset cannot be
            pivoted with the given columns.
        """
        variables = self._as_list(variables)
        values = self._as_list(values)
        ignored = self._as_list(ignored)

        self.id = make_key(os.path.basename(dataset_path))
        self.group = os.path.dirname(dataset_path).replace(os.sep, '.')

        dataset: pd.DataFrame = loader(os.path.join(root_dir, dataset_path))

        # Every column that is not a variable, value or ignored column is
        # treated as a dimension. Build the exclusion set once.
        excluded = set(variables) | set(values) | set(ignored)
        dimension_columns = [c for c in dataset.columns.to_list() if c not in excluded]

        # Process variables
        try:
            facts = dataset.pivot(index=dimension_columns, columns=variables, values=values).reset_index(drop=True)
        except Exception:
            # Best effort: a dataset that cannot be pivoted is skipped by the
            # caller. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            return None
        # Drop the first column level so the remaining labels come from the
        # variable column(s).
        facts.columns = facts.columns.droplevel()
        self.facts = [Fact(column) for _, column in facts.items()]

        # Calculate dimension columns
        self.dimensions = [Dimension(column) for _, column in dataset.loc[:, dimension_columns].items()]

        return self

    def __repr__(self) -> str:
        return f'Metadata->{self.dimensions}->{self.facts}'

    def to_dict(self) -> dict:
        """Return the entry as a dict with facts and dimensions expanded.

        Scalar attributes are copied as-is; ``facts`` and ``dimensions`` are
        replaced by the ``to_dict()`` output of their items.
        """
        result = {k: v for k, v in self.__dict__.items() if k not in ['facts', 'dimensions']}
        result['facts'] = [f.to_dict() for f in self.facts]
        result['dimensions'] = [d.to_dict() for d in self.dimensions]
        return result


# One entry per discovered file: load() gives back the Metadata instance, or
# None when the dataset could not be pivoted.
catalogue = [Metadata().load(dataset_file, root_dir=root_dir) for dataset_file in files]

In [46]:
# Create the output folder if it is missing.
os.makedirs('../../src/_data/metadata/', exist_ok=True)

# Serialise every truthy catalogue entry (None entries are dropped) as a JSON
# array of records.
pd.DataFrame(
  [loaded.to_dict() for loaded in filter(None, catalogue)]
).to_json('../../src/_data/metadata/catalogue.json', orient='records', indent=2)
