In [9]:
"""
Script to load and filter the PPD data for the Midlands Counties only.

"""

import pandas as pd

csv_path = "../data/raw/pp-monthly-update-new-version.csv"

# read the CSV file, splitting on commas only if they are not inside double quotes.
df = pd.read_csv(
    csv_path,
    sep=r',(?=(?:[^"]*"[^"]*")*[^"]*$)',
    engine="python",
    header=0,
)
df.columns = df.columns.str.strip()

# clean string columns by trimming spaces and extra quotes
for col in df.select_dtypes(include="object"):
    df[col] = df[col].str.strip().str.strip(chr(34))

df.head()


Unnamed: 0,transaction,price,transfer_date,postcode,property_type,new_build,tenure,PAON,SAON,Street,Locality,town_city,district,county,PPD_category,Status
0,{3DCCB7CA-A2B4-5B9D-E063-4704A8C0331E},105000,2021-03-24 00:00,LS4 2BG,F,Y,L,1,FLAT 47,VIADUCT ROAD,,LEEDS,LEEDS,WEST YORKSHIRE,A,A
1,{3DCCB7CA-A2B5-5B9D-E063-4704A8C0331E},162000,2021-04-13 00:00,LS4 2BG,F,Y,L,1,FLAT 56,VIADUCT ROAD,,LEEDS,LEEDS,WEST YORKSHIRE,A,A
2,{3DCCB7CA-A2B6-5B9D-E063-4704A8C0331E},140000,2021-03-24 00:00,LS4 2BG,F,Y,L,1,FLAT 37,VIADUCT ROAD,,LEEDS,LEEDS,WEST YORKSHIRE,A,A
3,{3DCCB7CA-A2C1-5B9D-E063-4704A8C0331E},107000,2021-03-24 00:00,LS4 2BG,F,Y,L,1,FLAT 35,VIADUCT ROAD,,LEEDS,LEEDS,WEST YORKSHIRE,A,A
4,{3DCCB7CA-A3D7-5B9D-E063-4704A8C0331E},350000,2021-07-22 00:00,SS15 6HW,O,N,F,2,,MASTERS CRESCENT,LAINDON,BASILDON,BASILDON,ESSEX,B,A


In [10]:
# df shape
df.shape
df.info()
df.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90017 entries, 0 to 90016
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   transaction    90017 non-null  object
 1   price          90017 non-null  object
 2   transfer_date  90017 non-null  object
 3   postcode       90017 non-null  object
 4   property_type  90017 non-null  object
 5   new_build      90017 non-null  object
 6   tenure         90017 non-null  object
 7   PAON           90017 non-null  object
 8   SAON           90017 non-null  object
 9   Street         90017 non-null  object
 10  Locality       90017 non-null  object
 11  town_city      90017 non-null  object
 12  district       90017 non-null  object
 13  county         90017 non-null  object
 14  PPD_category   90017 non-null  object
 15  Status         90017 non-null  object
dtypes: object(16)
memory usage: 11.0+ MB


Unnamed: 0,transaction,price,transfer_date,postcode,property_type,new_build,tenure,PAON,SAON,Street,Locality,town_city,district,county,PPD_category,Status
count,90017,90017,90017,90017.0,90017,90017,90017,90017,90017.0,90017.0,90017.0,90017,90017,90017,90017,90017
unique,90017,7170,2388,77189.0,5,2,2,12650,1689.0,46365.0,7978.0,1123,319,114,2,3
top,{3DCCB7CA-A2B4-5B9D-E063-4704A8C0331E},250000,2025-07-25 00:00,,S,N,F,1,,,,LONDON,LEEDS,GREATER LONDON,A,A
freq,1,1096,3815,246.0,24151,81879,70315,2384,78721.0,1541.0,55227.0,6496,1145,10506,73067,85252


In [11]:
# check for missing values
df.isnull().sum()


transaction      0
price            0
transfer_date    0
postcode         0
property_type    0
new_build        0
tenure           0
PAON             0
SAON             0
Street           0
Locality         0
town_city        0
district         0
county           0
PPD_category     0
Status           0
dtype: int64

In [12]:
df.duplicated().sum()



0

In [None]:
# Check the number of unique values in each column
df.nunique()

transaction      90017
price             7170
transfer_date     2388
postcode         77189
property_type        5
new_build            2
tenure               2
PAON             12650
SAON              1689
Street           46365
Locality          7978
town_city         1123
district           319
county             114
PPD_category         2
Status               3
dtype: int64

In [15]:
df.shape
# what are the lengths of the postcodes?
df["postcode"].str.len().value_counts().sort_index()

postcode
0      246
6     2551
7    44876
8    42344
Name: count, dtype: int64

In [None]:
# drop rows with postcode null or length is 0
df = df[df["postcode"].notnull()]
df = df[df["postcode"].str.len() > 0]
df.shape

(89771, 16)

In [17]:
# focus on PPD category type A (standard sale) and status A (added)
df = df[(df["PPD_category"] == "A") & (df["Status"] == "A")]
df.shape

(70258, 16)

In [18]:
# # duplicates removal
# dups if postcode, PAON, SAON, street, locality, town, city, county are the same
# find these and display them
duplicates = df[df.duplicated(subset=["price", "postcode", "PAON", "SAON", "Street", "Locality", "town_city", "county"], keep=False)]
# order by postcode
duplicates = duplicates.sort_values(by=["postcode"])
duplicates

Unnamed: 0,transaction,price,transfer_date,postcode,property_type,new_build,tenure,PAON,SAON,Street,Locality,town_city,district,county,PPD_category,Status
17619,{3DCCB7C9-F091-5B9D-E063-4704A8C0331E},740000,2025-03-28 00:00,AL6 0DB,D,N,F,32,,HERTFORD ROAD,DIGSWELL,WELWYN,WELWYN HATFIELD,HERTFORDSHIRE,A,A
17618,{3DCCB7C9-F090-5B9D-E063-4704A8C0331E},740000,2025-03-28 00:00,AL6 0DB,D,N,F,32,,HERTFORD ROAD,DIGSWELL,WELWYN,WELWYN HATFIELD,HERTFORDSHIRE,A,A
62801,{3DCCB7CA-898E-5B9D-E063-4704A8C0331E},185000,2025-08-01 00:00,B11 3AG,T,N,F,82,,SOLIHULL ROAD,SPARKHILL,BIRMINGHAM,BIRMINGHAM,WEST MIDLANDS,A,A
62802,{3DCCB7CA-898F-5B9D-E063-4704A8C0331E},185000,2025-08-01 00:00,B11 3AG,T,N,F,82,,SOLIHULL ROAD,SPARKHILL,BIRMINGHAM,BIRMINGHAM,WEST MIDLANDS,A,A
56269,{3DCCB7CA-83F5-5B9D-E063-4704A8C0331E},300000,2025-06-05 00:00,B11 4EY,D,N,L,119,,OAKWOOD ROAD,SPARKHILL,BIRMINGHAM,BIRMINGHAM,WEST MIDLANDS,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76005,{3DCCB7CA-4F27-5B9D-E063-4704A8C0331E},213000,2025-05-23 00:00,WS7 0EW,S,N,F,20,,HUDSON DRIVE,,BURNTWOOD,LICHFIELD,STAFFORDSHIRE,A,A
25694,{3DCCB7CA-262A-5B9D-E063-4704A8C0331E},230000,2024-08-22 00:00,WV11 3LD,T,N,F,9,,HOLBERG GROVE,WEDNESFIELD,WOLVERHAMPTON,WOLVERHAMPTON,WEST MIDLANDS,A,A
10982,{3DCCB7CA-87E4-5B9D-E063-4704A8C0331E},230000,2024-08-22 00:00,WV11 3LD,T,N,L,9,,HOLBERG GROVE,WEDNESFIELD,WOLVERHAMPTON,WOLVERHAMPTON,WEST MIDLANDS,A,A
58047,{3DCCB7CA-8D2A-5B9D-E063-4704A8C0331E},220000,2025-07-18 00:00,WV12 5NW,S,N,F,8,,CORSICAN CLOSE,,WILLENHALL,WALSALL,WEST MIDLANDS,A,A


In [19]:
# remove duplicates, keeping the last occurrence
df = df.drop_duplicates(
    subset=["price", "postcode", "PAON", "SAON", "Street", "Locality", "town_city", "county"],
    keep="last"
)
df.shape

(70168, 16)

In [20]:
# Use these Midlands counties only
counties_to_use = [
    "STOKE-ON-TRENT",
    "CITY OF DERBY",
    "DERBYSHIRE",
    "WEST MIDLANDS",
    "STAFFORDSHIRE",
    "LEICESTERSHIRE",
    "LEICESTER",
    "WORCESTERSHIRE",
    "WARWICKSHIRE",
]

# filter to keep only these counties
df = df[df["county"].isin(counties_to_use)]
df.shape

(8231, 16)

In [21]:
df.to_csv("../data/clean/pp-monthly-cleaned.csv", index=False)