# 2. Data Cleaning — UK Housing
**Worked on by:** Marin Janushaj  
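**Goal:** Clean and prepare the raw housing data for analysis and modeling: normalise the column names, parse the transfer dates, check for missing values, and save the result as a Parquet file.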

In [1]:
from pathlib import Path

import pandas as pd

# Load the raw price-paid records in one pass (low_memory=False turns off
# chunked parsing, so each column gets a single inferred dtype).
df = pd.read_csv("../data/raw/price_paid_records.csv", low_memory=False)

# Normalise the headers: trim whitespace, lower-case, spaces -> underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22489348 entries, 0 to 22489347
Data columns (total 11 columns):
 #   Column                             Dtype 
---  ------                             ----- 
 0   transaction_unique_identifier      object
 1   price                              int64 
 2   date_of_transfer                   object
 3   property_type                      object
 4   old/new                            object
 5   duration                           object
 6   town/city                          object
 7   district                           object
 8   county                             object
 9   ppdcategory_type                   object
 10  record_status_-_monthly_file_only  object
dtypes: int64(1), object(10)
memory usage: 1.8+ GB


In [2]:
# Parse the transfer date. errors="coerce" replaces anything unparseable with
# NaT, so count the values lost that way and report them rather than letting
# them disappear silently.
parsed_dates = pd.to_datetime(df["date_of_transfer"], errors="coerce")
n_unparsed = int((parsed_dates.isna() & df["date_of_transfer"].notna()).sum())
df["date_of_transfer"] = parsed_dates
if n_unparsed:
    print(f"⚠️ {n_unparsed:,} date_of_transfer values could not be parsed and are now NaT")

In [3]:
# Peek at the first five values to check how the column looks after parsing.
df["date_of_transfer"].head(5)

0   1995-08-18
1   1995-08-09
2   1995-06-30
3   1995-11-24
4   1995-06-23
Name: date_of_transfer, dtype: datetime64[ns]

In [4]:
# Shorter, analysis-friendly column names. Only the columns whose name
# actually changes are listed; every other column keeps its current name.
short_names = {
    "transaction_unique_identifier": "transaction_id",
    "date_of_transfer": "date",
    "property_type": "type",
    "old/new": "is_new",
    "town/city": "town",
    "ppdcategory_type": "category",
    "record_status_-_monthly_file_only": "record_status",
}
df = df.rename(columns=short_names)

In [5]:
# Display the current column labels to confirm the rename took effect.
df.columns

Index(['transaction_id', 'price', 'date', 'type', 'is_new', 'duration', 'town',
       'district', 'county', 'category', 'record_status'],
      dtype='object')

In [6]:
# Count missing values per column; dates that failed to parse (NaT) are
# counted here as well.
missing_per_column = df.isna().sum()
missing_per_column

transaction_id    0
price             0
date              0
type              0
is_new            0
duration          0
town              0
district          0
county            0
category          0
record_status     0
dtype: int64

In [7]:
# Persist the frame as Parquet without writing the index.
# to_parquet does not create missing folders, so make sure the target
# directory exists first (e.g. when the clean/ folder has not been created yet).
clean_path = Path("../data/clean/uk_housing_clean.parquet")
clean_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(clean_path, index=False)
print("✅ Cleaned dataset saved!")

✅ Cleaned dataset saved!


In [9]:
ls -lh ../data/clean/

total 1736920
-rw-r--r--  1 marinjanushaj  staff    34B Nov  7 01:29 README.md
-rw-r--r--@ 1 marinjanushaj  staff   835M Nov  7 02:36 uk_housing_clean.parquet


In [10]:
# Round-trip check: reload the saved file and verify it against the in-memory
# frame instead of relying only on a visual look at the first rows.
test = pd.read_parquet("../data/clean/uk_housing_clean.parquet")
assert test.shape == df.shape, "Parquet round-trip changed the row/column count"
assert test.columns.equals(df.columns), "Parquet round-trip changed the column names"
assert pd.api.types.is_datetime64_any_dtype(test["date"]), "date column lost its datetime dtype"
test.head()

Unnamed: 0,transaction_id,price,date,type,is_new,duration,town,district,county,category,record_status
0,{81B82214-7FBC-4129-9F6B-4956B4A663AD},25000,1995-08-18,T,N,F,OLDHAM,OLDHAM,GREATER MANCHESTER,A,A
1,{8046EC72-1466-42D6-A753-4956BF7CD8A2},42500,1995-08-09,S,N,F,GRAYS,THURROCK,THURROCK,A,A
2,{278D581A-5BF3-4FCE-AF62-4956D87691E6},45000,1995-06-30,T,N,F,HIGHBRIDGE,SEDGEMOOR,SOMERSET,A,A
3,{1D861C06-A416-4865-973C-4956DB12CD12},43150,1995-11-24,T,N,F,BEDFORD,NORTH BEDFORDSHIRE,BEDFORDSHIRE,A,A
4,{DD8645FD-A815-43A6-A7BA-4956E58F1874},18899,1995-06-23,S,N,F,WAKEFIELD,LEEDS,WEST YORKSHIRE,A,A
