# ABR (Australian Business Register) Data Exploration

This notebook explores the ABR XML bulk extract data.

## Contents
1. Understanding ABR Data Structure
2. Loading and Parsing XML
3. Data Quality Analysis
4. Entity Distribution


In [None]:
# Import libraries
import sys
# Put the parent directory at the front of the module search path so the
# `src.*` imports further down can be resolved.
# NOTE(review): '..' is relative to the kernel's working directory — this
# assumes the notebook runs from a folder one level below the project root; confirm.
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Import our custom modules (must come after the sys.path tweak above)
from src.ingest.parse_abr import ABRParser, parse_abr_to_dataframe
from src.ingest.download_abr import create_sample_abr_data

print("Libraries imported successfully!")


## 2. Loading and Parsing XML (Sample ABR Data)


In [None]:
# Create a sample ABR XML file and load it into a DataFrame.
# The output path and record count are named once so that the generation and
# parsing steps cannot drift apart when either value is tuned.
SAMPLE_ABR_PATH = "../data/raw/abr/sample_abr.xml"
NUM_SAMPLE_RECORDS = 100

# Make sure the target folder exists on a fresh checkout
# (no-op when the directory is already there).
Path(SAMPLE_ABR_PATH).parent.mkdir(parents=True, exist_ok=True)

# Create sample ABR data file
sample_file = create_sample_abr_data(
    output_path=SAMPLE_ABR_PATH,
    num_records=NUM_SAMPLE_RECORDS,
)
print(f"Created sample file: {sample_file}")

# Parse the sample file; max_records is deliberately the same value that was
# passed as num_records above.
df = parse_abr_to_dataframe(sample_file, max_records=NUM_SAMPLE_RECORDS)
print(f"\nLoaded {len(df)} ABR records")
print(f"Columns: {list(df.columns)}")

# Keep this as the last expression so the notebook renders the preview table.
df.head()


## 3. Data Quality Analysis


In [None]:
# Data-quality overview: null counts per column, followed by the frequency
# of each value in the categorical columns of interest.
print("Missing Values:")
null_counts = df.isnull().sum()
print(null_counts)

# (heading, column) pairs, reported in this order
distribution_reports = (
    ("\nEntity Status Distribution:", "entity_status"),
    ("\nState Distribution:", "state"),
)
for heading, column_name in distribution_reports:
    print(heading)
    print(df[column_name].value_counts())
