In [1]:
import polars as pl
import csv
import glob
import os

In [2]:
def clean_file(input_path, output_path):
    with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
        reader = csv.reader(infile, delimiter=";", quotechar='"', escapechar="\\")
        writer = csv.writer(outfile, delimiter=";", quotechar='"', escapechar="\\", quoting=csv.QUOTE_MINIMAL)
        
        for row in reader:
            if row.count('"') % 2 != 0:  
                continue
            max_columns = 24  
            row = row[:max_columns] + [""] * (max_columns - len(row))
            writer.writerow(row)

In [3]:
extract_folder = "extract"
cleaned_folder = "cleaned"

In [4]:
dat_files = glob.glob(f"{extract_folder}/*.dat")
print(f"Number of .dat files found: {len(dat_files)}")

Number of .dat files found: 25176


In [5]:
os.makedirs(cleaned_folder, exist_ok=True)

dat_files = glob.glob(f"{extract_folder}/*.dat")

for file in dat_files:
    cleaned_path = os.path.join(cleaned_folder, os.path.basename(file))
    clean_file(file, cleaned_path)

In [6]:
df_list = []
df_schema = {
    'recordType': pl.Utf8,
    'districtCode': pl.Utf8,
    'propertyId': pl.Utf8,
    'saleCounter': pl.Utf8,
    'downloadDatetime': pl.Utf8,
    'propertyName': pl.Utf8,
    'propertyUnitNumber': pl.Utf8,
    'propertyHouseNumber': pl.Utf8,
    'propertyStreetName': pl.Utf8,
    'propertyLocality': pl.Utf8,
    'propertyPostCode': pl.Utf8,
    'area': pl.Utf8,
    'areaType': pl.Utf8,
    'contractDate': pl.Utf8,
    'settlementDate': pl.Utf8,
    'purchasePrice': pl.Utf8,
    'zoning': pl.Utf8,
    'natureofProperty': pl.Utf8,
    'primaryPurpose': pl.Utf8,
    'strataLotNumber': pl.Utf8,
    'componentCode': pl.Utf8,
    'saleCode': pl.Utf8,
    'InterestofSale%': pl.Utf8,
    'dealingNumber': pl.Utf8
}

In [7]:
dat_cleaned = glob.glob(f"{cleaned_folder}/*.dat")
for file in dat_cleaned:
    try:
        df = pl.read_csv(
            file,
            separator=";",
            has_header=False,
            schema = df_schema,
            truncate_ragged_lines=True,
            ignore_errors=True
        ).with_columns(
            [pl.col(column).cast(pl.Utf8) for column in df_schema.keys()])
        df_list.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

In [8]:
if df_list:
    try:
        df_combined = pl.concat(df_list)
        df_filtered = df_combined.filter(pl.col("recordType") == "B")
        df_filtered.write_csv("filtered_data.csv")
        print("Filtered data saved to 'filtered_data.csv'.")
    except Exception as e:
        print(f"Error during concatenation or filtering: {e}")
else:
    print("No valid DataFrames to process.")

Filtered data saved to 'filtered_data.csv'.


In [9]:
df_filtered.head()

recordType,districtCode,propertyId,saleCounter,downloadDatetime,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,zoning,natureofProperty,primaryPurpose,strataLotNumber,componentCode,saleCode,InterestofSale%,dealingNumber
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""B""","""001""","""2857799""","""1""","""20240101 01:07""",,,"""176""","""LAKE RD""","""ELRINGTON""","""2325""","""25.15""","""H""","""20231219""","""20231222""","""1330000""","""RU2""","""R""","""RESIDENCE""",,"""RAN""",,"""0""","""AT729586 """
"""B""","""001""","""4228""","""2""","""20240101 01:07""",,,"""2""","""KING ST""","""BRANXTON""","""2335""","""1864""","""M""","""20231115""","""20231222""","""850000""","""R3""","""R""","""RESIDENCE""",,"""MAB""",,"""0""","""AT731473 """
"""B""","""001""","""7750""","""3""","""20240101 01:07""",,,"""275""","""MAITLAND RD""","""CESSNOCK""","""2325""","""379.8""","""M""","""20231026""","""20231222""","""500000""","""R3""","""3""","""SHOP""",,"""MAA""",,"""0""","""AT729914 """
"""B""","""001""","""11439""","""4""","""20240101 01:07""",,,"""19""","""DEAKIN ST""","""KURRI KURRI""","""2327""","""1682""","""M""","""20231103""","""20231222""","""575000""","""R2""","""R""","""RESIDENCE""",,"""AAO""",,"""0""","""AT731374 """
"""B""","""001""","""18319""","""5""","""20240101 01:07""",,,"""700""","""MILSONS ARM RD""","""WOLLOMBI""","""2325""","""31.77""","""H""","""20231218""","""20231222""","""236667""","""RU2""","""R""","""RESIDENCE""",,"""RAG""",,"""0""","""AT733421 """


In [10]:
df_cleaning = df_filtered.with_columns(
    pl.col("contractDate").str.strptime(pl.Date, "%Y%m%d", strict=False).alias("contractDate")
)

In [11]:
df_cleaning = df_cleaning.select('districtCode',
                            'propertyId',
                            'propertyName',
                            'propertyUnitNumber',
                            'propertyHouseNumber',
                            'propertyStreetName',
                            'propertyLocality',
                            'propertyPostCode',
                            'area',
                            'areaType',
                            'contractDate',
                            'settlementDate',
                            'purchasePrice',
                            'primaryPurpose',
                            'strataLotNumber')

In [12]:
df_cleaning = df_cleaning.with_columns(pl.col("area").cast(pl.Float64).alias("area"))

In [13]:
df_cleaning = df_cleaning.with_columns(
    pl.when(pl.col('areaType') == 'M')
     .then(pl.col('area').cast(pl.Float64))
     .when(pl.col('areaType') == 'H')
     .then(pl.col('area').cast(pl.Float64) * 10000)
     .otherwise(None)
     .alias('area(m)')
)

In [14]:
df_cleaning = df_cleaning.with_columns(pl.col("purchasePrice").cast(pl.Int64).alias("purchasePrice"))

In [15]:
df_cleaning = df_cleaning.with_columns(pl.col("propertyId").cast(pl.Int64).alias("propertyId"))

In [16]:
df_cleaning = df_cleaning.with_columns(
    pl.col("settlementDate").str.strptime(pl.Date, "%Y%m%d", strict=False).alias("settlementDate")
)

In [17]:
df_cleaning = df_cleaning.with_columns([
    pl.col('settlementDate').dt.year().alias('settlementYear'),
    pl.col('settlementDate').dt.month().alias('settlementMonth')
])


In [18]:
df_cleaning = df_cleaning.filter(
    pl.col('settlementYear') >= 2021
)

In [19]:
df_cleaning = df_cleaning.with_columns((pl.col("purchasePrice") / pl.col("area(m)")).alias("pricePerM2").round(2))

In [20]:
df_cleaning.head()

districtCode,propertyId,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,primaryPurpose,strataLotNumber,area(m),settlementYear,settlementMonth,pricePerM2
str,i64,str,str,str,str,str,str,f64,str,str,date,i64,str,str,f64,i32,i8,f64
"""001""",2857799,,,"""176""","""LAKE RD""","""ELRINGTON""","""2325""",25.15,"""H""","""20231219""",2023-12-22,1330000,"""RESIDENCE""",,251500.0,2023,12,5.29
"""001""",4228,,,"""2""","""KING ST""","""BRANXTON""","""2335""",1864.0,"""M""","""20231115""",2023-12-22,850000,"""RESIDENCE""",,1864.0,2023,12,456.01
"""001""",7750,,,"""275""","""MAITLAND RD""","""CESSNOCK""","""2325""",379.8,"""M""","""20231026""",2023-12-22,500000,"""SHOP""",,379.8,2023,12,1316.48
"""001""",11439,,,"""19""","""DEAKIN ST""","""KURRI KURRI""","""2327""",1682.0,"""M""","""20231103""",2023-12-22,575000,"""RESIDENCE""",,1682.0,2023,12,341.85
"""001""",18319,,,"""700""","""MILSONS ARM RD""","""WOLLOMBI""","""2325""",31.77,"""H""","""20231218""",2023-12-22,236667,"""RESIDENCE""",,317700.0,2023,12,0.74


In [21]:
df_cleaning_unique = df_cleaning.unique(keep="last")

In [22]:
df_cleaning_unique.n_unique()

792946

In [23]:
cleaned_df = df_cleaning_unique.with_columns(pl.arange(0, df_cleaning_unique.height).alias("uniqueSaleKey"))

In [24]:
cleaned_df.head()

districtCode,propertyId,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,primaryPurpose,strataLotNumber,area(m),settlementYear,settlementMonth,pricePerM2,uniqueSaleKey
str,i64,str,str,str,str,str,str,f64,str,str,date,i64,str,str,f64,i32,i8,f64,i64
"""005""",4079197,,"""2""","""387""","""NEW ENGLAND HWY""","""RUTHERFORD""","""2320""",617.0,"""M""","""20220202""",2022-03-30,1300000,"""COMMERCIAL""","""2""",617.0,2022,3,2106.97,0
"""004""",4455621,,"""3""","""14 B""","""KAROONDA CL""","""RATHMINES""","""2283""",113.0,"""M""","""20220526""",2023-02-17,330000,"""FACTORY""","""3""",113.0,2023,2,2920.35,1
"""207""",2019481,,"""18""","""101""","""WENTWORTH ST""","""RANDWICK""","""2031""",,,"""20230914""",2023-11-09,850000,"""RESIDENCE""","""18""",,2023,11,,2
"""004""",164984,,"""1""","""30""","""HUDSON ST""","""WHITEBRIDGE""","""2290""",,,"""20210804""",2021-09-08,625000,"""RESIDENCE""","""1""",,2021,9,,3
"""223""",2500291,,,"""2""","""KATHERINE ST""","""CECIL HILLS""","""2171""",889.4,"""M""","""20231202""",2024-01-29,1600000,"""RESIDENCE""",,889.4,2024,1,1798.97,4


In [25]:
output_file = "./cleaned.csv"

cleaned_df.write_csv(output_file)

In [26]:
unit_df = cleaned_df.filter((pl.col("propertyUnitNumber") != "null") | (pl.col("strataLotNumber") != "null"))

In [27]:
unit_df.head()

districtCode,propertyId,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,primaryPurpose,strataLotNumber,area(m),settlementYear,settlementMonth,pricePerM2,uniqueSaleKey
str,i64,str,str,str,str,str,str,f64,str,str,date,i64,str,str,f64,i32,i8,f64,i64
"""005""",4079197,,"""2""","""387""","""NEW ENGLAND HWY""","""RUTHERFORD""","""2320""",617.0,"""M""","""20220202""",2022-03-30,1300000,"""COMMERCIAL""","""2""",617.0,2022,3,2106.97,0
"""004""",4455621,,"""3""","""14 B""","""KAROONDA CL""","""RATHMINES""","""2283""",113.0,"""M""","""20220526""",2023-02-17,330000,"""FACTORY""","""3""",113.0,2023,2,2920.35,1
"""207""",2019481,,"""18""","""101""","""WENTWORTH ST""","""RANDWICK""","""2031""",,,"""20230914""",2023-11-09,850000,"""RESIDENCE""","""18""",,2023,11,,2
"""004""",164984,,"""1""","""30""","""HUDSON ST""","""WHITEBRIDGE""","""2290""",,,"""20210804""",2021-09-08,625000,"""RESIDENCE""","""1""",,2021,9,,3
"""708""",3603342,,"""217""","""13""","""BAYSWATER RD""","""POTTS POINT""","""2011""",,,"""20230125""",2023-02-17,1100000,"""RESIDENCE""","""17""",,2023,2,,7


In [28]:
unit_df_filter = unit_df.filter(pl.col("primaryPurpose").is_in(["UNIT", "RESIDENCE", "HOUSE UNIT"]))

In [29]:
unit_df_filter.head()

districtCode,propertyId,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,primaryPurpose,strataLotNumber,area(m),settlementYear,settlementMonth,pricePerM2,uniqueSaleKey
str,i64,str,str,str,str,str,str,f64,str,str,date,i64,str,str,f64,i32,i8,f64,i64
"""207""",2019481,,"""18""","""101""","""WENTWORTH ST""","""RANDWICK""","""2031""",,,"""20230914""",2023-11-09,850000,"""RESIDENCE""","""18""",,2023,11,,2
"""004""",164984,,"""1""","""30""","""HUDSON ST""","""WHITEBRIDGE""","""2290""",,,"""20210804""",2021-09-08,625000,"""RESIDENCE""","""1""",,2021,9,,3
"""708""",3603342,,"""217""","""13""","""BAYSWATER RD""","""POTTS POINT""","""2011""",,,"""20230125""",2023-02-17,1100000,"""RESIDENCE""","""17""",,2023,2,,7
"""082""",4073357,,"""105""","""450""","""PEATS FERRY RD""","""ASQUITH""","""2077""",112.0,"""M""","""20211117""",2021-12-13,630000,"""RESIDENCE""","""14""",112.0,2021,12,5625.0,9
"""276""",3607706,"""FUSION ARNCLIFFE""","""322""","""18""","""BONAR ST""","""ARNCLIFFE""","""2205""",,,"""20230816""",2023-09-27,515000,"""RESIDENCE""","""140""",,2023,9,,16


In [30]:
unit_df_filter.n_unique()

258602

In [31]:
output_file2 = "./units.csv"

unit_df_filter.write_csv(output_file2)

In [32]:
unit_groupby = unit_df_filter.group_by(['propertyLocality', 'settlementYear']).agg(pl.col('pricePerM2').mean().alias('avgPricePerM2').round(2))
unit_groupby.sort(['propertyLocality', 'settlementYear'])

propertyLocality,settlementYear,avgPricePerM2
str,i32,f64
"""ABBOTSBURY""",2021,2653.68
"""ABBOTSFORD""",2021,6215.51
"""ABBOTSFORD""",2022,5805.39
"""ABBOTSFORD""",2023,10195.46
"""ABBOTSFORD""",2024,9810.86
…,…,…
"""YOWIE BAY""",2024,5393.94
"""ZETLAND""",2021,10411.85
"""ZETLAND""",2022,10388.3
"""ZETLAND""",2023,11018.7


In [33]:
output_file3 = "./units_grouped.csv"

unit_groupby.write_csv(output_file3)

In [50]:
df = pl.read_csv("units_grouped.csv")

In [51]:
unit_pivot_df = df.pivot(
    index="propertyLocality",
    columns="settlementYear",
    values="avgPricePerM2",
    aggregate_function=None  # Use this if you need to aggregate duplicate entries
)

  unit_pivot_df = df.pivot(


In [58]:
unit_pivot_df = unit_pivot_df[["propertyLocality", "2021", "2022", "2023", "2024"]]

In [59]:
unit_pivot_df.head()

propertyLocality,2021,2022,2023,2024
str,f64,f64,f64,f64
"""TOONGABBIE""",3957.02,4631.85,4205.63,5111.62
"""GUNDAGAI""",1446.29,913.04,1556.25,1546.32
"""SINGLETON HEIGHTS""",807.25,1273.41,,1049.36
"""HOMEBUSH WEST""",7953.36,5278.8,5551.58,5700.36
"""COLYTON""",3330.31,2727.27,1851.71,1373.84


In [60]:
output_file_units = "Data/units_data.csv"

unit_pivot_df.write_csv(output_file_units)

In [61]:
house_df = cleaned_df.filter(
    (pl.col("propertyUnitNumber").is_null() & pl.col("strataLotNumber").is_null()))

In [62]:
house_df.head()

districtCode,propertyId,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,primaryPurpose,strataLotNumber,area(m),settlementYear,settlementMonth,pricePerM2,uniqueSaleKey
str,i64,str,str,str,str,str,str,f64,str,str,date,i64,str,str,f64,i32,i8,f64,i64
"""223""",2500291,,,"""2""","""KATHERINE ST""","""CECIL HILLS""","""2171""",889.4,"""M""","""20231202""",2024-01-29,1600000,"""RESIDENCE""",,889.4,2024,1,1798.97,4
"""081""",4412826,,,"""6""","""PILASTER ST""","""GABLES""","""2765""",450.5,"""M""","""20201027""",2022-03-16,579990,"""VACANT LAND""",,450.5,2022,3,1287.44,5
"""217""",2280647,,,"""17""","""BOURKE PL""","""CAMDEN SOUTH""","""2570""",809.4,"""M""","""20210511""",2021-05-25,508560,"""RESIDENCE""",,809.4,2021,5,628.32,6
"""273""",2664160,,,"""1""","""INGEBYRA ST""","""JINDABYNE""","""2627""",853.6,"""M""","""20220511""",2022-06-06,1331000,"""RESIDENCE""",,853.6,2022,6,1559.28,8
"""007""",200152,,,"""25""","""THOMPSON ST""","""MUSWELLBROOK""","""2333""",701.9,"""M""","""20230125""",2023-03-08,392500,"""RESIDENCE""",,701.9,2023,3,559.2,10


In [63]:
house_df_filter = house_df.filter(pl.col("primaryPurpose").is_in(["RESIDENCE", "DWELLING", "HOUSE AND FARM"]))

In [64]:
house_df_filter.head()

districtCode,propertyId,propertyName,propertyUnitNumber,propertyHouseNumber,propertyStreetName,propertyLocality,propertyPostCode,area,areaType,contractDate,settlementDate,purchasePrice,primaryPurpose,strataLotNumber,area(m),settlementYear,settlementMonth,pricePerM2,uniqueSaleKey
str,i64,str,str,str,str,str,str,f64,str,str,date,i64,str,str,f64,i32,i8,f64,i64
"""223""",2500291,,,"""2""","""KATHERINE ST""","""CECIL HILLS""","""2171""",889.4,"""M""","""20231202""",2024-01-29,1600000,"""RESIDENCE""",,889.4,2024,1,1798.97,4
"""217""",2280647,,,"""17""","""BOURKE PL""","""CAMDEN SOUTH""","""2570""",809.4,"""M""","""20210511""",2021-05-25,508560,"""RESIDENCE""",,809.4,2021,5,628.32,6
"""273""",2664160,,,"""1""","""INGEBYRA ST""","""JINDABYNE""","""2627""",853.6,"""M""","""20220511""",2022-06-06,1331000,"""RESIDENCE""",,853.6,2022,6,1559.28,8
"""007""",200152,,,"""25""","""THOMPSON ST""","""MUSWELLBROOK""","""2333""",701.9,"""M""","""20230125""",2023-03-08,392500,"""RESIDENCE""",,701.9,2023,3,559.2,10
"""004""",150249,,,"""28""","""OXLEY ST""","""SWANSEA""","""2281""",841.0,"""M""","""20241004""",2024-11-01,1173000,"""RESIDENCE""",,841.0,2024,11,1394.77,11


In [65]:
house_df_filter.n_unique()

419496

In [66]:
output_file4 = "./house.csv"

house_df_filter.write_csv(output_file4)

In [67]:
house_groupby = house_df_filter.group_by(['propertyLocality', 'settlementYear']).agg(pl.col('pricePerM2').mean().alias('avgPricePerM2').round(2))
house_groupby.sort(['propertyLocality', 'settlementYear'])


propertyLocality,settlementYear,avgPricePerM2
str,i32,f64
"""AARONS PASS""",2022,1.35
"""AARONS PASS""",2024,1.0
"""ABBOTSBURY""",2021,1781.23
"""ABBOTSBURY""",2022,2318.06
"""ABBOTSBURY""",2023,2203.31
…,…,…
"""ZARA""",2024,7.09
"""ZETLAND""",2021,13354.66
"""ZETLAND""",2022,11080.24
"""ZETLAND""",2023,13850.57


In [68]:
house_groupby.head()

propertyLocality,settlementYear,avgPricePerM2
str,i32,f64
"""BOSSLEY PARK""",2022,1840.17
"""BARRAGANYATTI""",2021,11.84
"""NORTH MACKSVILLE""",2022,705.97
"""CANOELANDS""",2022,17.27
"""FULLERTON""",2022,3.55


In [69]:
output_file5 = "./house_grouped.csv"

house_groupby.write_csv(output_file5)

In [70]:
df = pl.read_csv("house_grouped.csv")

In [71]:
house_pivot_df = df.pivot(
    index="propertyLocality",
    columns="settlementYear",
    values="avgPricePerM2",
    aggregate_function=None  # Use this if you need to aggregate duplicate entries
)

  house_pivot_df = df.pivot(


In [72]:
house_pivot_df = house_pivot_df[["propertyLocality", "2021", "2022", "2023", "2024"]]

In [73]:
house_pivot_df.head()

propertyLocality,2021,2022,2023,2024
str,f64,f64,f64,f64
"""BOSSLEY PARK""",1709.53,1840.17,1876.51,2152.69
"""BARRAGANYATTI""",11.84,7.13,7.4,1.6
"""NORTH MACKSVILLE""",294.21,705.97,641.89,552.09
"""CANOELANDS""",19.78,17.27,21.23,18.73
"""FULLERTON""",,3.55,0.76,2.14


In [74]:
output_file_units = "Data/houses_data.csv"

house_pivot_df.write_csv(output_file_units)