## Step 1: Discover all CSV files under `retail_data/`

In [None]:

import glob
# Recursively collect every CSV path under retail_data/. glob returns matches
# in arbitrary order, so sort them to keep the output of later cells stable
# from run to run.
src_file_names = sorted(glob.glob('retail_data/**/*.csv', recursive=True))
src_file_names


## Step 2: Extract dataset names from file paths

In [None]:

import re

def extract_dataset_names(file_list):
    """Pair each CSV path with the dataset name taken from its file name.

    A path matches when it contains ``retail_data/<name>.csv`` where ``<name>``
    consists only of word characters. Paths that do not match (for example
    CSVs nested in a sub-directory) are skipped.

    Args:
        file_list: Iterable of file path strings.

    Returns:
        List of ``(original_path, dataset_name)`` tuples, in input order.
    """
    pattern = re.compile(r"retail_data/(\w+)\.csv")
    results = []
    for f in file_list:
        # glob yields OS-native separators; normalise them so that Windows
        # paths such as ``retail_data\orders.csv`` are not silently dropped.
        m = pattern.search(f.replace("\\", "/"))
        if m:
            # Keep the original path untouched: it is what gets opened later.
            results.append((f, m.group(1)))
    return results

# Build the list of (file path, dataset name) pairs from the discovered CSVs
# and print one "name → path" line per pair as a quick sanity check.
datasets = extract_dataset_names(src_file_names)
print("Datasets discovered:")
for f, n in datasets:
    print(f"{n} → {f}")


## Step 3: Load schema definitions from `retail_schema/schemas.json`

In [None]:

import json

def load_schemas(schema_path='retail_schema/schemas.json'):
    """Load the schema definitions from a JSON file.

    Args:
        schema_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly; otherwise the locale's default encoding is
    # used, which can mis-decode non-ASCII text on some platforms.
    with open(schema_path, 'r', encoding='utf-8') as f:
        schemas = json.load(f)
    print("Schemas loaded successfully.")
    return schemas

# Load the schema definitions once, from the default path, for the cells below.
schemas = load_schemas()


## Step 4: Read dataset with schema applied

In [None]:

import pandas as pd
import numpy as np

def read_dataset_with_schema(file_path, dataset_name, schemas):
    """Read a header-less CSV into a DataFrame using the dataset's schema.

    Args:
        file_path: Path of the CSV file (anything ``pandas.read_csv`` accepts).
        dataset_name: Key used to look the schema up in ``schemas``.
        schemas: Mapping of dataset name to a list of column descriptors.
            Each descriptor is a dict with ``column_name``, ``data_type`` and
            ``column_position`` keys.

    Returns:
        A DataFrame whose columns are named and ordered by
        ``column_position``. ``integer`` columns are read as nullable
        ``Int64``, ``float`` columns as ``float``, and every other type as
        ``string``. ``timestamp`` columns are then converted to datetimes,
        with unparseable values becoming ``NaT``.

    Raises:
        ValueError: If ``schemas`` has no entry for ``dataset_name``.
    """
    import warnings

    schema = schemas.get(dataset_name)
    if schema is None:
        raise ValueError(f"No schema found for dataset: {dataset_name}")

    # The CSV has no header row, so column names come from the schema,
    # ordered by their declared position.
    columns = sorted(schema, key=lambda x: x['column_position'])
    col_names = [col['column_name'] for col in columns]

    # Anything that is not an integer or float (including timestamps, empty
    # and unknown types) is read as a string first.
    pandas_dtypes = {'integer': 'Int64', 'float': 'float'}
    dtype_map = {
        col['column_name']: pandas_dtypes.get(col['data_type'], 'string')
        for col in schema
    }

    df = pd.read_csv(file_path, names=col_names, dtype=dtype_map, header=None)

    timestamp_cols = [c['column_name'] for c in schema if c['data_type'] == 'timestamp']
    for col in timestamp_cols:
        try:
            # errors='coerce' turns unparseable values into NaT. The
            # deprecated infer_datetime_format flag is intentionally omitted:
            # format inference is the default behaviour in current pandas.
            df[col] = pd.to_datetime(df[col], errors='coerce')
        except (ValueError, TypeError) as exc:
            # Stay best-effort (the column remains a string column), but make
            # the failure visible instead of swallowing it silently.
            warnings.warn(
                f"Could not convert column {col!r} of dataset "
                f"{dataset_name!r} to datetime: {exc}"
            )

    return df


## Step 5: Read all datasets and apply schemas

In [None]:

# Maps dataset name -> DataFrame loaded with that dataset's schema.
dataframes = {}

# Load every discovered dataset. A dataset without a schema entry raises
# ValueError inside read_dataset_with_schema and stops the loop.
for file_path, ds_name in datasets:
    df = read_dataset_with_schema(file_path, ds_name, schemas)
    dataframes[ds_name] = df
    # Print the (rows, columns) shape as a quick load check.
    print(f"Loaded {ds_name} → {df.shape}")


## Step 6: Validate dataset loading

In [None]:

# Dataset names that must be present after loading.
expected = {"categories", "customers", "departments", "order_items", "orders", "products"}
loaded = set(dataframes.keys())

# Separate the two ways the sets can differ, so the report does not claim
# datasets are "missing" when the mismatch is only caused by extra ones.
missing = expected - loaded
unexpected = loaded - expected

if not missing and not unexpected:
    print(f"All {len(expected)} datasets loaded successfully.")
else:
    if missing:
        print("Some datasets missing:", sorted(missing))
    if unexpected:
        print("Unexpected datasets loaded:", sorted(unexpected))
    print("Expected:", expected)
    print("Loaded:", loaded)


## Step 7: Display sample records from each dataset

In [None]:

# Show the first rows of every loaded dataset under an upper-cased heading.
# `display` is not defined in this file; it is presumably the builtin that
# IPython/Jupyter provides (confirm if this is run outside a notebook).
for name, df in dataframes.items():
    print(f"\n{name.upper()} SAMPLE:")
    display(df.head())
