In [None]:
# === CHECK MARTS (Pandas) ===============================================
# Use in Jupyter (cell by cell) or save as notebooks/check_marts.py and run.
# ------------------------------------------------------------------------

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Paths
# BASE_DIR is the project root.
# - Run as a script saved at notebooks/check_marts.py: parents[0] is notebooks/,
#   parents[1] is the project root (parents[2] would be one level too high).
# - Run in a notebook: `__file__` is undefined, so fall back to the parent of the
#   current working directory (assumes the kernel runs inside notebooks/ — TODO confirm).
BASE_DIR = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path("..").resolve()
DATA_DIR = BASE_DIR / "data"
MARTS_DIR = DATA_DIR / "marts"

# Sanity check: shows where the marts are expected to live
print(MARTS_DIR)

# Partitioned parquet directories
FACT_BY_TYPE_DIR      = MARTS_DIR / "fact_prices"            # yearly agg by (district, property_type, year)
FACT_BY_DISTRICT_DIR  = MARTS_DIR / "fact_prices_district"   # yearly agg by (district, year)

# Single Parquet (coalesced to 1) with 5y bounds
BOUNDS_5Y_PATH        = MARTS_DIR / "agg_bounds_5y"

# Transformed (row-level) dataset directory
TRANSFORMED = DATA_DIR / "transformed"

# Raw, headerless CSV
RAW = DATA_DIR / "raw" / "pp_complete.csv"

# Copy of the raw CSV with a header row added
T_CSV = BASE_DIR / "ppdata_with_header.csv"

# Render floats with thousands separators and no decimals (avoids scientific notation)
pd.set_option("display.float_format", "{:,.0f}".format)

/Users/renatopavlekovic/Documents/Digital Futures/property-pulse-uk/data/marts


In [None]:
# Load marts

# Yearly aggregates by (district, property_type, year): used by the price tracker and map
df_type = pd.read_parquet(path=FACT_BY_TYPE_DIR)

# Full transformed dataset: used for price prediction
df_parq = pd.read_parquet(path=TRANSFORMED)


In [5]:
# Preview the first 10 rows of the per-property-type fact table
df_type.head(n=10)

Unnamed: 0,district,property_type,avg_price,txn_count,year
0,TRAFFORD,T,51058,735,1995
1,BIRMINGHAM,T,42073,4832,1995
2,SUFFOLK COASTAL,F,42808,137,1995
3,PETERBOROUGH,D,77697,654,1995
4,COLCHESTER,D,96122,807,1995
5,EAST DORSET,D,109006,964,1995
6,TEST VALLEY,T,57882,455,1995
7,BABERGH,F,31060,66,1995
8,WILTSHIRE,F,38461,14,1995
9,ST MARY'S,D,162667,3,1995


In [6]:
# Preview the first 10 rows of the full transformed dataset
df_parq.head(n=10)

Unnamed: 0,transaction_id,price,transfer_datetime,postcode,property_type,new_build,tenure,paon,saon,street,locality,town,district,county,ppd_category,record_status,transfer_ts,transfer_date,year
0,{000FD2E3-A03F-4E53-922E-44E5D5DD89CC},36760,1995-12-18 00:00,MK410TW,T,Y,F,51,,POPPYFIELDS,BEDFORD,BEDFORD,NORTH BEDFORDSHIRE,BEDFORDSHIRE,A,A,1995-12-18 00:00:00,1995-12-18,1995
1,{01871C12-E430-48B5-A406-36CE616B1F8D},48500,1995-12-13 00:00,LS257QQ,T,N,F,55,,CROMWELL RISE,KIPPAX,LEEDS,LEEDS,WEST YORKSHIRE,A,A,1995-12-13 00:00:00,1995-12-13,1995
2,{001496E0-226A-4CD7-8C11-42FCD650EE35},72950,1995-04-27 00:00,WF26DW,S,N,F,6,,PLEDWICK GROVE,WAKEFIELD,WAKEFIELD,WAKEFIELD,WEST YORKSHIRE,A,A,1995-04-26 23:00:00,1995-04-27,1995
3,{01879071-92F9-406A-82A0-8B5A6CECB10E},58000,1995-06-30 00:00,NP78SY,S,N,F,8,,TROTHY WAY,LLANTILIO CROSSENNY,ABERGAVENNY,MONMOUTH,GWENT,A,A,1995-06-29 23:00:00,1995-06-30,1995
4,{0014B03D-4050-4394-8423-F68F64122393},50000,1995-05-12 00:00,NP206LE,S,N,L,74,,WAVELL DRIVE,NEWPORT,NEWPORT,NEWPORT,NEWPORT,A,A,1995-05-11 23:00:00,1995-05-12,1995
5,{0189C99B-659A-4DDA-8F86-25AF7709C20E},13000,1995-07-18 00:00,S755PQ,S,N,F,7,,HEDGE LANE,DARTON,BARNSLEY,BARNSLEY,SOUTH YORKSHIRE,A,A,1995-07-17 23:00:00,1995-07-18,1995
6,{0014F310-15F0-4193-BD39-3EA330F3D59A},53000,1995-02-10 00:00,L359LU,S,Y,L,10,,KIRKMAN FOLD,RAINHILL,PRESCOT,ST HELENS,MERSEYSIDE,A,A,1995-02-10 00:00:00,1995-02-10,1995
7,{018D6767-CC6D-4089-B94E-B16E57315297},63000,1995-10-13 00:00,OX169UG,S,N,F,14,,HAREWOOD ROAD,BANBURY,BANBURY,CHERWELL,OXFORDSHIRE,A,A,1995-10-12 23:00:00,1995-10-13,1995
8,{0019CC3A-2224-49D5-8437-51648D5CA4D8},51000,1995-03-31 00:00,SW170JG,F,N,L,6,,PEARTREE AVENUE,LONDON,LONDON,WANDSWORTH,GREATER LONDON,A,A,1995-03-30 23:00:00,1995-03-31,1995
9,{01962985-AA1C-48E1-80BF-C727B49B8C59},55000,1995-11-24 00:00,BH122DA,T,N,F,54,,ALBERT ROAD,POOLE,POOLE,POOLE,POOLE,A,A,1995-11-24 00:00:00,1995-11-24,1995


In [5]:


# One-off preparation step (kept for reference, not run here): the raw CSV is
# headerless, so it was read with explicit column names and re-saved with a header.
# NOTE(review): `columns` is not defined in this notebook — define it before re-enabling.
#df_csv = pd.read_csv(RAW, header=None, names=columns)
#df_csv.to_csv("ppdata_with_header.csv", index=False)

# Load the headered CSV. This assignment must run: the line below and later
# cells use `df_full_csv`, which otherwise raises NameError on a fresh kernel.
df_full_csv = pd.read_csv(T_CSV)

# Convert transfer_datetime to datetime; unparseable values become NaT instead of raising
df_full_csv["transfer_datetime"] = pd.to_datetime(df_full_csv["transfer_datetime"], errors="coerce")


In [24]:
# Keep only 2022 transactions whose town is Colchester and county is Essex
# (case-insensitive); 4628 transactions are expected.
check_ch = df_parq.loc[
    lambda txns: (txns["town"].str.upper() == "COLCHESTER")
    & (txns["county"].str.upper() == "ESSEX")
    & (txns["year"] == 2022)
]

# Non-null count per column for the filtered rows
check_ch.count(axis=0)

transaction_id       4628
price                4628
transfer_datetime    4628
postcode             4628
property_type        4628
new_build            4628
tenure               4628
paon                 4628
saon                  438
street               4625
locality             2326
town                 4628
district             4628
county               4628
ppd_category         4628
record_status        4628
transfer_ts          4628
transfer_date        4628
year                 4628
dtype: int64

In [8]:
# Non-null count per column in the CSV-backed frame
df_full_csv.count(axis=0)

transaction_id       30365754
price                30365754
transfer_datetime    30365754
postcode             30316542
property_type        30365754
new_build            30365754
tenure               30365754
paon                 30361570
saon                  3626679
street               29883553
locality             18987870
town                 30365754
district             30365754
county               30365754
ppd_category         30365754
record_status        30365754
dtype: int64

In [22]:
# Non-null count per column in the transformed Parquet dataset
df_parq.count(axis=0)

transaction_id       29789443
price                29789443
transfer_datetime    29789443
postcode             29773506
property_type        29789443
new_build            29789443
tenure               29789443
paon                 29785259
saon                  3526921
street               29354290
locality             18750529
town                 29789443
district             29789443
county               29789443
ppd_category         29789443
record_status        29789443
transfer_ts          29789443
transfer_date        29789443
year                 29789443
dtype: int64

In [4]:
# Select rows whose district is HIGHLAND (case-insensitive); no year filter is applied.
# NOTE(review): presumably a check for Scottish records in the dataset — confirm intent.
is_highland = df_parq["district"].str.upper() == "HIGHLAND"
scot = df_parq[is_highland]

# Render the matching rows
display(scot)

Unnamed: 0,transaction_id,price,transfer_datetime,postcode,property_type,new_build,tenure,paon,saon,street,locality,town,district,county,ppd_category,record_status,transfer_ts,transfer_date,year


In [38]:
# Distinct district names, in order of first appearance.
# `df_full` is not defined anywhere in this notebook (leftover kernel state), so use
# the CSV-backed frame instead. NOTE(review): confirm `df_full_csv` is the intended frame.
df_full_csv["district"].unique()

array(['WIGAN', 'SHEPWAY', 'WOLVERHAMPTON', 'CEREDIGION', 'HOUNSLOW',
       'PORTSMOUTH', 'NORTH WEST LEICESTERSHIRE', 'SOUTH NORFOLK',
       'HORSHAM', 'ENFIELD', 'CASTLE POINT', 'BROXBOURNE', 'CARDIFF',
       'CROYDON', 'ROCHESTER UPON MEDWAY', 'REIGATE AND BANSTEAD',
       'AYLESBURY VALE', 'EREWASH', 'HAMMERSMITH AND FULHAM', 'FENLAND',
       'DARLINGTON', 'SEDGEFIELD', 'CRAWLEY', 'ADUR', 'MANCHESTER',
       'THREE RIVERS', 'ARUN', 'GATESHEAD', 'BARNET', 'HARINGEY',
       'MID DEVON', 'BARNSLEY', 'COTSWOLD', 'ROSSENDALE', 'BRECKLAND',
       'LEEDS', 'WAKEFIELD', 'PORT TALBOT', 'RESTORMEL', 'SANDWELL',
       'MAIDSTONE', 'SOUTH STAFFORDSHIRE', 'ROTHERHAM', 'CARADON',
       'WREXHAM MAELOR', 'BLACKBURN', 'KINGSWOOD', 'SOUTH KESTEVEN',
       'GLOUCESTER', 'RICHMOND UPON THAMES', 'CHARNWOOD', 'KENNET',
       'CHELTENHAM', 'BEXLEY', 'BURY', 'KERRIER', 'TAMESIDE',
       'WINCHESTER', 'EAST HERTFORDSHIRE', 'NORWICH', 'TOWER HAMLETS',
       'LEICESTER', 'STOKE-ON-TRENT', 'TAF

In [None]:
from config.streamlit_config import MART_FACT_BY_DISTRICT