In [77]:
import polars as pl
import csv
import glob
import os

In [78]:
def clean_file(input_path, output_path):
    with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
        reader = csv.reader(infile, delimiter=";", quotechar='"', escapechar="\\")
        writer = csv.writer(outfile, delimiter=";", quotechar='"', escapechar="\\", quoting=csv.QUOTE_MINIMAL)
        
        for row in reader:
            if row.count('"') % 2 != 0:  
                continue
            max_columns = 24  
            row = row[:max_columns] + [""] * (max_columns - len(row))
            writer.writerow(row)

In [79]:
extract_folder = "extract"
cleaned_folder = "cleaned"

In [80]:
dat_files = glob.glob(f"{extract_folder}/*.dat")
print(f"Number of .dat files found: {len(dat_files)}")

Number of .dat files found: 25176


In [81]:
os.makedirs(cleaned_folder, exist_ok=True)

dat_files = glob.glob(f"{extract_folder}/*.dat")

for file in dat_files:
    cleaned_path = os.path.join(cleaned_folder, os.path.basename(file))
    clean_file(file, cleaned_path)

In [82]:
df_list = []
df_schema = {
    'Record Type': pl.Utf8,
    'District Code': pl.Utf8,
    'Property Id.': pl.Utf8,
    'Sale Counter': pl.Utf8,
    'Download Datetime': pl.Utf8,
    'Property Name': pl.Utf8,
    'Property Unit Number': pl.Utf8,
    'Property House Number': pl.Utf8,
    'Property Street Name': pl.Utf8,
    'Property Locality': pl.Utf8,
    'Property Post Code': pl.Utf8,
    'Area': pl.Utf8,
    'Area Type': pl.Utf8,
    'Contract Date': pl.Utf8,
    'Settlement Date': pl.Utf8,
    'Purchase Price': pl.Utf8,
    'Zoning': pl.Utf8,
    'Nature of Property': pl.Utf8,
    'Primary Purpose': pl.Utf8,
    'Strata Lot Number': pl.Utf8,
    'Component code': pl.Utf8,
    'Sale Code': pl.Utf8,
    '% Interest of Sale': pl.Utf8,
    'Dealing Number': pl.Utf8
}

In [85]:
dat_cleaned = glob.glob(f"{cleaned_folder}/*.dat")
for file in dat_cleaned:
    try:
        df = pl.read_csv(
            file,
            separator=";",
            has_header=False,
            schema = df_schema,
            truncate_ragged_lines=True,
            ignore_errors=True
        ).with_columns(
            [pl.col(column).cast(pl.Utf8) for column in df_schema.keys()])
        df_list.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

In [86]:
if df_list:
    try:
        df_combined = pl.concat(df_list)
        df_filtered = df_combined.filter(pl.col("Record Type") == "B")
        df_filtered.write_csv("filtered_data.csv")
        print("Filtered data saved to 'filtered_data.csv'.")
    except Exception as e:
        print(f"Error during concatenation or filtering: {e}")
else:
    print("No valid DataFrames to process.")

Filtered data saved to 'filtered_data.csv'.


In [240]:
df_filtered.head()

Record Type,District Code,Property Id.,Sale Counter,Download Datetime,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Zoning,Nature of Property,Primary Purpose,Strata Lot Number,Component code,Sale Code,% Interest of Sale,Dealing Number
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""B""","""001""","""2857799""","""1""","""20240101 01:07""",,,"""176""","""LAKE RD""","""ELRINGTON""","""2325""","""25.15""","""H""","""20231219""","""20231222""","""1330000""","""RU2""","""R""","""RESIDENCE""",,"""RAN""",,"""0""","""AT729586 """
"""B""","""001""","""4228""","""2""","""20240101 01:07""",,,"""2""","""KING ST""","""BRANXTON""","""2335""","""1864""","""M""","""20231115""","""20231222""","""850000""","""R3""","""R""","""RESIDENCE""",,"""MAB""",,"""0""","""AT731473 """
"""B""","""001""","""7750""","""3""","""20240101 01:07""",,,"""275""","""MAITLAND RD""","""CESSNOCK""","""2325""","""379.8""","""M""","""20231026""","""20231222""","""500000""","""R3""","""3""","""SHOP""",,"""MAA""",,"""0""","""AT729914 """
"""B""","""001""","""11439""","""4""","""20240101 01:07""",,,"""19""","""DEAKIN ST""","""KURRI KURRI""","""2327""","""1682""","""M""","""20231103""","""20231222""","""575000""","""R2""","""R""","""RESIDENCE""",,"""AAO""",,"""0""","""AT731374 """
"""B""","""001""","""18319""","""5""","""20240101 01:07""",,,"""700""","""MILSONS ARM RD""","""WOLLOMBI""","""2325""","""31.77""","""H""","""20231218""","""20231222""","""236667""","""RU2""","""R""","""RESIDENCE""",,"""RAG""",,"""0""","""AT733421 """


In [241]:
df_cleaning = df_filtered.with_columns(
    pl.col("Contract Date").str.strptime(pl.Date, "%Y%m%d", strict=False).alias("Contract Date")
)

In [242]:
df_cleaning = df_cleaning.select('District Code',
                            'Property Id.',
                            'Property Name',
                            'Property Unit Number',
                            'Property House Number',
                            'Property Street Name',
                            'Property Locality',
                            'Property Post Code',
                            'Area',
                            'Area Type',
                            'Contract Date',
                            'Settlement Date',
                            'Purchase Price',
                            'Primary Purpose',
                            'Strata Lot Number')

In [243]:
df_cleaning = df_cleaning.with_columns(pl.col("Area").cast(pl.Float64).alias("Area"))

In [244]:
df_cleaning = df_cleaning.with_columns(
    pl.when(pl.col('Area Type') == 'M')
     .then(pl.col('Area').cast(pl.Float64))
     .when(pl.col('Area Type') == 'H')
     .then(pl.col('Area').cast(pl.Float64) * 10000)
     .otherwise(None)
     .alias('Area (m)')
)

In [245]:
df_cleaning = df_cleaning.with_columns(pl.col("Purchase Price").cast(pl.Int64).alias("Purchase Price"))

In [246]:
df_cleaning = df_cleaning.with_columns(pl.col("Property Id.").cast(pl.Int64).alias("Property Id."))

In [247]:
df_cleaning = df_cleaning.with_columns(
    pl.col("Settlement Date").str.strptime(pl.Date, "%Y%m%d", strict=False).alias("Settlement Date")
)

In [248]:
df_cleaning = df_cleaning.with_columns([
    pl.col('Settlement Date').dt.year().alias('Settlement Year'),
    pl.col('Settlement Date').dt.month().alias('Settlement Month')
])


In [249]:
df_cleaning = df_cleaning.filter(
    pl.col('Settlement Year') >= 2021
)

In [250]:
df_cleaning = df_cleaning.with_columns((pl.col("Purchase Price") / pl.col("Area (m)")).alias("Price per m^2").round(2))

In [251]:
df_cleaning.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Area (m),Settlement Year,Settlement Month,Price per m^2
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64,i32,i8,f64
"""001""",2857799,,,"""176""","""LAKE RD""","""ELRINGTON""","""2325""",25.15,"""H""",2023-12-19,2023-12-22,1330000,"""RESIDENCE""",,251500.0,2023,12,5.29
"""001""",4228,,,"""2""","""KING ST""","""BRANXTON""","""2335""",1864.0,"""M""",2023-11-15,2023-12-22,850000,"""RESIDENCE""",,1864.0,2023,12,456.01
"""001""",7750,,,"""275""","""MAITLAND RD""","""CESSNOCK""","""2325""",379.8,"""M""",2023-10-26,2023-12-22,500000,"""SHOP""",,379.8,2023,12,1316.48
"""001""",11439,,,"""19""","""DEAKIN ST""","""KURRI KURRI""","""2327""",1682.0,"""M""",2023-11-03,2023-12-22,575000,"""RESIDENCE""",,1682.0,2023,12,341.85
"""001""",18319,,,"""700""","""MILSONS ARM RD""","""WOLLOMBI""","""2325""",31.77,"""H""",2023-12-18,2023-12-22,236667,"""RESIDENCE""",,317700.0,2023,12,0.74


In [252]:
df_cleaning.n_unique

<bound method DataFrame.n_unique of shape: (806_780, 19)
┌──────────┬──────────────┬───────────┬──────────────────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ District ┆ Property Id. ┆ Property  ┆ Property Unit Number ┆ … ┆ Area (m)   ┆ Settlement ┆ Settlement ┆ Price per │
│ Code     ┆ ---          ┆ Name      ┆ ---                  ┆   ┆ ---        ┆ Year       ┆ Month      ┆ m^2       │
│ ---      ┆ i64          ┆ ---       ┆ str                  ┆   ┆ f64        ┆ ---        ┆ ---        ┆ ---       │
│ str      ┆              ┆ str       ┆                      ┆   ┆            ┆ i32        ┆ i8         ┆ f64       │
╞══════════╪══════════════╪═══════════╪══════════════════════╪═══╪════════════╪════════════╪════════════╪═══════════╡
│ 001      ┆ 2857799      ┆ null      ┆ null                 ┆ … ┆ 251500.0   ┆ 2023       ┆ 12         ┆ 5.29      │
│ 001      ┆ 4228         ┆ null      ┆ null                 ┆ … ┆ 1864.0     ┆ 2023       ┆ 12         ┆ 456.01    │

In [253]:
unit_df = df_cleaning.filter((pl.col("Property Unit Number") != "null") | (pl.col("Strata Lot Number") != "null"))

In [254]:
unit_df.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Area (m),Settlement Year,Settlement Month,Price per m^2
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64,i32,i8,f64
"""001""",3691928,,"""1""","""18""","""FAIRVIEW PL""","""CESSNOCK""","""2325""",,,2023-11-10,2023-12-22,1550000,"""RESIDENCE""","""1""",,2023,12,
"""001""",3691928,,"""2""","""18""","""FAIRVIEW PL""","""CESSNOCK""","""2325""",,,2023-11-10,2023-12-22,1550000,"""RESIDENCE""","""2""",,2023,12,
"""001""",3691928,,"""3""","""18""","""FAIRVIEW PL""","""CESSNOCK""","""2325""",,,2023-11-10,2023-12-22,1550000,"""RESIDENCE""","""3""",,2023,12,
"""001""",3799095,,"""1""","""24 A""","""ZINFANDEL CCT""","""CESSNOCK""","""2325""",218.0,"""M""",2023-11-10,2023-12-22,525000,"""RESIDENCE""","""1""",218.0,2023,12,2408.26
"""001""",3029751,"""GOLDEN DOOR HEALTH RETREAT AND…","""16""","""165""","""THOMPSONS RD""","""POKOLBIN""","""2320""",,,2020-11-25,2021-01-22,100000,"""RESIDENCE""","""16""",,2021,1,


In [255]:
unit_df_filter = unit_df.filter(pl.col("Primary Purpose").is_in(["UNIT", "RESIDENCE", "HOUSE UNIT"]))

In [256]:
unit_df_filter.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Area (m),Settlement Year,Settlement Month,Price per m^2
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64,i32,i8,f64
"""001""",3691928,,"""1""","""18""","""FAIRVIEW PL""","""CESSNOCK""","""2325""",,,2023-11-10,2023-12-22,1550000,"""RESIDENCE""","""1""",,2023,12,
"""001""",3691928,,"""2""","""18""","""FAIRVIEW PL""","""CESSNOCK""","""2325""",,,2023-11-10,2023-12-22,1550000,"""RESIDENCE""","""2""",,2023,12,
"""001""",3691928,,"""3""","""18""","""FAIRVIEW PL""","""CESSNOCK""","""2325""",,,2023-11-10,2023-12-22,1550000,"""RESIDENCE""","""3""",,2023,12,
"""001""",3799095,,"""1""","""24 A""","""ZINFANDEL CCT""","""CESSNOCK""","""2325""",218.0,"""M""",2023-11-10,2023-12-22,525000,"""RESIDENCE""","""1""",218.0,2023,12,2408.26
"""001""",3029751,"""GOLDEN DOOR HEALTH RETREAT AND…","""16""","""165""","""THOMPSONS RD""","""POKOLBIN""","""2320""",,,2020-11-25,2021-01-22,100000,"""RESIDENCE""","""16""",,2021,1,


In [270]:
output_file2 = "./units.csv"

unit_df_filter.write_csv(output_file2)

In [271]:
unit_groupby = unit_df_filter.group_by(['Property Locality', 'Settlement Year']).agg(pl.col('Price per m^2').mean().alias('Average Price per m^2 per year').round(2))
unit_groupby.sort(['Property Locality', 'Settlement Year'])

Property Locality,Settlement Year,Average Price per m^2 per year
str,i32,f64
"""ABBOTSBURY""",2021,2653.68
"""ABBOTSFORD""",2021,6215.51
"""ABBOTSFORD""",2022,5805.39
"""ABBOTSFORD""",2023,10195.46
"""ABBOTSFORD""",2024,9810.86
…,…,…
"""YOWIE BAY""",2024,5393.94
"""ZETLAND""",2021,10411.85
"""ZETLAND""",2022,10388.3
"""ZETLAND""",2023,11018.7


In [272]:
output_file3 = "./units_grouped.csv"

unit_groupby.write_csv(output_file3)

In [273]:
house_df = df_cleaning.filter(
    (pl.col("Property Unit Number").is_null() & pl.col("Strata Lot Number").is_null()))

In [274]:
house_df.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Area (m),Settlement Year,Settlement Month,Price per m^2
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64,i32,i8,f64
"""001""",2857799,,,"""176""","""LAKE RD""","""ELRINGTON""","""2325""",25.15,"""H""",2023-12-19,2023-12-22,1330000,"""RESIDENCE""",,251500.0,2023,12,5.29
"""001""",4228,,,"""2""","""KING ST""","""BRANXTON""","""2335""",1864.0,"""M""",2023-11-15,2023-12-22,850000,"""RESIDENCE""",,1864.0,2023,12,456.01
"""001""",7750,,,"""275""","""MAITLAND RD""","""CESSNOCK""","""2325""",379.8,"""M""",2023-10-26,2023-12-22,500000,"""SHOP""",,379.8,2023,12,1316.48
"""001""",11439,,,"""19""","""DEAKIN ST""","""KURRI KURRI""","""2327""",1682.0,"""M""",2023-11-03,2023-12-22,575000,"""RESIDENCE""",,1682.0,2023,12,341.85
"""001""",18319,,,"""700""","""MILSONS ARM RD""","""WOLLOMBI""","""2325""",31.77,"""H""",2023-12-18,2023-12-22,236667,"""RESIDENCE""",,317700.0,2023,12,0.74


In [275]:
house_df_filter = house_df.filter(pl.col("Primary Purpose").is_in(["RESIDENCE", "DWELLING", "HOUSE AND FARM"]))

In [276]:
house_df_filter.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Area (m),Settlement Year,Settlement Month,Price per m^2
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64,i32,i8,f64
"""001""",2857799,,,"""176""","""LAKE RD""","""ELRINGTON""","""2325""",25.15,"""H""",2023-12-19,2023-12-22,1330000,"""RESIDENCE""",,251500.0,2023,12,5.29
"""001""",4228,,,"""2""","""KING ST""","""BRANXTON""","""2335""",1864.0,"""M""",2023-11-15,2023-12-22,850000,"""RESIDENCE""",,1864.0,2023,12,456.01
"""001""",11439,,,"""19""","""DEAKIN ST""","""KURRI KURRI""","""2327""",1682.0,"""M""",2023-11-03,2023-12-22,575000,"""RESIDENCE""",,1682.0,2023,12,341.85
"""001""",18319,,,"""700""","""MILSONS ARM RD""","""WOLLOMBI""","""2325""",31.77,"""H""",2023-12-18,2023-12-22,236667,"""RESIDENCE""",,317700.0,2023,12,0.74
"""001""",3073376,,,"""35 A""","""COLLIERY ST""","""ABERDARE""","""2325""",3869.8,"""M""",2023-11-21,2023-12-22,1490000,"""RESIDENCE""",,3869.8,2023,12,385.03


In [277]:
output_file4 = "./house.csv"

house_df_filter.write_csv(output_file4)

In [278]:
house_groupby = house_df_filter.group_by(['Property Locality', 'Settlement Year']).agg(pl.col('Price per m^2').mean().alias('Average Price per m^2 per year').round(2))
house_groupby.sort(['Property Locality', 'Settlement Year'])


Property Locality,Settlement Year,Average Price per m^2 per year
str,i32,f64
"""AARONS PASS""",2022,1.35
"""AARONS PASS""",2024,1.0
"""ABBOTSBURY""",2021,1781.23
"""ABBOTSBURY""",2022,2318.06
"""ABBOTSBURY""",2023,2203.31
…,…,…
"""ZARA""",2024,7.09
"""ZETLAND""",2021,13354.66
"""ZETLAND""",2022,11080.24
"""ZETLAND""",2023,13850.57


In [279]:
house_groupby.head()

Property Locality,Settlement Year,Average Price per m^2 per year
str,i32,f64
"""BRUNSWICK HEADS""",2021,2950.27
"""PARRAMATTA""",2023,4950.59
"""HALLSVILLE""",2024,115.27
"""BEACON HILL""",2022,3743.84
"""CEDAR PARTY""",2021,11.29


In [280]:
output_file5 = "./house_grouped.csv"

house_groupby.write_csv(output_file5)