## Capstone Two - DC Housing Price Prediction: Pre-processing & Training Data Development

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Columns to remove right after loading (errors='ignore' below skips any
# that are not present in the file).
cols_to_drop = [
    'Unnamed: 0', 'SOURCE', 'CMPLX_NUM', 'GIS_LAST_MOD_DTTM',
    'SQUARE', 'X', 'Y', 'NATIONALGRID',
]

# Load the dataset and apply the basic cleaning as one chain:
#   1. drop the columns listed above,
#   2. keep rows whose PRICE is strictly positive (rows with a missing PRICE
#      fail the comparison and are removed as well),
#   3. discard exact duplicate rows.
df = (
    pd.read_csv('DC_Properties.csv', low_memory=False)
    .drop(columns=cols_to_drop, errors='ignore')
    .loc[lambda frame: frame['PRICE'] > 0]
    .drop_duplicates()
)

df.shape

(97832, 41)

In [4]:
# Select the target (PRICE) together with the numeric and categorical features
# that will be encoded in the next step. .copy() detaches the selection from df.
df_model = df[['PRICE', 'ROOMS', 'BEDRM', 'BATHRM', 'LANDAREA', 'WARD', 'HEAT', 'STRUCT']].copy()

# Drop rows missing any of the categorical features. WARD is part of this
# subset, so no separate WARD null filter is needed afterwards.
# Note: the numeric columns are not null-filtered here.
df_model = df_model.dropna(subset=['WARD', 'HEAT', 'STRUCT'])
df_model.head()

Unnamed: 0,PRICE,ROOMS,BEDRM,BATHRM,LANDAREA,WARD,HEAT,STRUCT
0,1095000.0,8,4,4,1680,Ward 2,Warm Cool,Row Inside
2,2100000.0,9,5,3,1680,Ward 2,Hot Water Rad,Row Inside
3,1602000.0,8,5,3,1680,Ward 2,Hot Water Rad,Row Inside
5,1950000.0,10,5,3,2196,Ward 2,Hot Water Rad,Row Inside
7,1050000.0,8,4,3,1627,Ward 2,Hot Water Rad,Row Inside


In [5]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, so a column with k categories yields k-1 indicators.
categorical_features = ['WARD', 'HEAT', 'STRUCT']
df_encoded = pd.get_dummies(
    data=df_model,
    columns=categorical_features,
    drop_first=True,
)
df_encoded.head()

Unnamed: 0,PRICE,ROOMS,BEDRM,BATHRM,LANDAREA,WARD_Ward 2,WARD_Ward 3,WARD_Ward 4,WARD_Ward 5,WARD_Ward 6,...,HEAT_Wall Furnace,HEAT_Warm Cool,HEAT_Water Base Brd,STRUCT_Multi,STRUCT_Row End,STRUCT_Row Inside,STRUCT_Semi-Detached,STRUCT_Single,STRUCT_Town End,STRUCT_Town Inside
0,1095000.0,8,4,4,1680,True,False,False,False,False,...,False,True,False,False,False,True,False,False,False,False
2,2100000.0,9,5,3,1680,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,1602000.0,8,5,3,1680,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
5,1950000.0,10,5,3,2196,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
7,1050000.0,8,4,3,1627,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [6]:
# Target vector: the PRICE column
y = df_encoded['PRICE']
# Predictor matrix: every column except PRICE
X = df_encoded.drop(columns=['PRICE'])

In [7]:
# Standardize every predictor column (the numeric features and the dummy
# indicator columns alike) with StandardScaler's default settings.
scaler = StandardScaler()

# fit_transform returns a bare NumPy array, so re-attach both the column names
# and the original row labels. Without index=X.index the new frame would get a
# fresh 0..n-1 index and no longer line up label-for-label with y, which still
# carries the row labels inherited from df.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

Unnamed: 0,ROOMS,BEDRM,BATHRM,LANDAREA,WARD_Ward 2,WARD_Ward 3,WARD_Ward 4,WARD_Ward 5,WARD_Ward 6,WARD_Ward 7,...,HEAT_Wall Furnace,HEAT_Warm Cool,HEAT_Water Base Brd,STRUCT_Multi,STRUCT_Row End,STRUCT_Row Inside,STRUCT_Semi-Detached,STRUCT_Single,STRUCT_Town End,STRUCT_Town Inside
0,0.239203,0.516432,1.689195,-0.474329,4.005516,-0.397467,-0.468225,-0.445048,-0.44605,-0.39487,...,-0.038344,1.756709,-0.040326,-0.227199,-0.368009,1.213126,-0.406453,-0.624579,-0.036492,-0.056617
1,0.672668,1.411132,0.748974,-0.474329,4.005516,-0.397467,-0.468225,-0.445048,-0.44605,-0.39487,...,-0.038344,-0.569246,-0.040326,-0.227199,-0.368009,1.213126,-0.406453,-0.624579,-0.036492,-0.056617
2,0.239203,1.411132,0.748974,-0.474329,4.005516,-0.397467,-0.468225,-0.445048,-0.44605,-0.39487,...,-0.038344,-0.569246,-0.040326,-0.227199,-0.368009,1.213126,-0.406453,-0.624579,-0.036492,-0.056617
3,1.106132,1.411132,0.748974,-0.310403,4.005516,-0.397467,-0.468225,-0.445048,-0.44605,-0.39487,...,-0.038344,-0.569246,-0.040326,-0.227199,-0.368009,1.213126,-0.406453,-0.624579,-0.036492,-0.056617
4,0.239203,0.516432,0.748974,-0.491166,4.005516,-0.397467,-0.468225,-0.445048,-0.44605,-0.39487,...,-0.038344,-0.569246,-0.040326,-0.227199,-0.368009,1.213126,-0.406453,-0.624579,-0.036492,-0.056617


In [8]:
# Split into train and test sets.
# The split is done on the *unscaled* predictors: X_scaled was produced by a
# scaler fitted on all rows, so splitting it would let test-set statistics leak
# into the training features. The same random_state and row count give the same
# row assignment as splitting X_scaled would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both subsets. Row labels are preserved so that the scaled
# frames stay aligned with y_train / y_test.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((46319, 31), (11580, 31), (46319,), (11580,))


## Summary of Pre-processing Steps

- Selected both numerical and categorical variables relevant to house pricing.
- Created dummy variables for `WARD`, `HEAT`, and `STRUCT` to convert the categorical features into indicator columns (the first level of each is dropped as the baseline).
- Applied standard scaling (zero mean, unit variance) to all predictors, including the dummy-encoded columns.
- Split the dataset into training and testing subsets using an 80/20 ratio.
