In [1]:
import pandas as pd

# Step 1: Load Raw Data
# Read the raw housing CSV into `df`, the frame that every later step works on.
# The path is relative, so it resolves against the notebook's working directory.
raw_csv_path = "../data/raw/house_data.csv"
df = pd.read_csv(raw_csv_path)

In [2]:
# Step 2: Initial Inspection
# Report the (rows, columns) shape of the freshly loaded frame, then print its
# last five rows as a quick look at the end of the file.
shape_before_cleaning = df.shape
print("Initial data shape:", shape_before_cleaning)
print(df.tail(5))

Initial data shape: (84, 7)
      price  sqft  bedrooms  bathrooms    location  year_built  condition
79   530000  2080         3        2.0       Urban        1991       Good
80   372000  1640         2        1.5      Suburb        1963       Fair
81   592000  2220         3        2.0    Downtown        1985       Good
82   328000  1730         2        1.5       Rural        1965       Fair
83  1190000  3170         4        3.5  Waterfront        2006  Excellent


In [3]:
# Step 3: Drop malformed rows (like the one with missing columns)
# Every row containing at least one missing value is removed. The arguments
# below simply spell out dropna()'s defaults (row-wise, drop on any NaN).
df = df.dropna(axis=0, how="any")

In [4]:
# Step 4: Data Type Conversion
# Columns that must be numeric before the range checks in Step 5 can run.
numeric_columns = ['price', 'sqft', 'bedrooms', 'bathrooms', 'year_built']

# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising, so corrupted cells surface as missing values.
# assign() returns a new frame rather than writing columns into the result of
# dropna(); on pandas versions without copy-on-write that in-place write may
# raise SettingWithCopyWarning. Existing columns keep their position.
df = df.assign(**{
    column: pd.to_numeric(df[column], errors='coerce')
    for column in numeric_columns
})


In [5]:
# Step 5: Handle Missing or Corrupted Values
# Exclusive lower bounds used by the basic sanity checks below.
MIN_PRICE = 10000
MIN_SQFT = 200

# Remove rows with missing values, including any cell that the numeric
# conversion in Step 4 coerced to NaN.
df = df.dropna()

# Keep only rows that pass both sanity checks. After dropna() there are no
# NaNs left, so one combined mask selects the same rows as two successive filters.
passes_sanity_checks = (df['price'] > MIN_PRICE) & (df['sqft'] > MIN_SQFT)

# .copy() makes the filtered frame independent of the one it was sliced from,
# so later in-place column assignments do not raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df[passes_sanity_checks].copy()

In [6]:
# Step 6: Standardize Categorical Columns
# Trim surrounding whitespace and title-case both text columns so that
# spellings differing only in padding or letter case collapse to one label.
# assign() builds a new frame instead of writing columns into the
# boolean-filtered frame from Step 5; that in-place write may raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.assign(
    location=df['location'].str.strip().str.title(),
    condition=df['condition'].str.strip().str.title(),
)


### Next Step 

Run the data processing script with the following command:

```
python src/data/run_processing.py --input data/raw/house_data.csv --output data/processed/cleaned_house_data.csv 
```
