In [None]:
# preprocessing_data.ipynb
# Preprocessing Toronto Home Price Dataset

# -- Imports --
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# -- Load Dataset --
import kagglehub

# Fetch the dataset through kagglehub; the value it returns is kept in `path`
# and used by the loading step below to locate the CSV file.
dataset_handle = "alankmwong/toronto-home-price-index"
path = kagglehub.dataset_download(dataset_handle)

In [None]:
# Read CSV
# Build the file location with os.path.join rather than string concatenation,
# so the separator matches the platform and a trailing separator in `path`
# cannot produce a doubled one.
file_path = os.path.join(path, "toronto_home_price_index.csv")
df = pd.read_csv(file_path)

print("Dataset loaded successfully.")

# -- Basic Exploration --
print("\nFirst few rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# -- Clean Missing Values --
# Report the number of nulls per column before anything is removed.
print("\nChecking for missing values:")
null_counts = df.isnull().sum()
print(null_counts)

# Keep only the rows in which every column is populated; `df` itself is left
# untouched and the filtered result is bound to `df_clean`.
df_clean = df.dropna()

print("\nAfter cleaning, dataset shape:", df_clean.shape)

# -- Feature Engineering --
# Derive 'Year' and 'Month' from the 'Date' column. The plotting and modeling
# steps further down read 'Date', 'Year' and 'Month' unconditionally, so a
# missing 'Date' column is reported here with a clear message instead of
# surfacing later as a less obvious error.
if 'Date' not in df_clean.columns:
    raise KeyError(
        "Expected a 'Date' column; found columns: "
        f"{list(df_clean.columns)}"
    )

# assign() returns a new DataFrame, so nothing is written in place into the
# result of dropna() (in-place writes there can emit SettingWithCopyWarning on
# pandas versions without copy-on-write).
parsed_dates = pd.to_datetime(df_clean['Date'])
df_clean = df_clean.assign(
    Date=parsed_dates,
    Year=parsed_dates.dt.year,
    Month=parsed_dates.dt.month,
)

# -- Visualization: Price Trends --

# Draw the 'HPI' column against 'Date' on an explicitly created axes
# (assumes df_clean has both columns -- TODO confirm against the dataset).
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(x='Date', y='HPI', data=df_clean, ax=ax)
ax.set_title('Toronto Home Price Index Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Home Price Index')
ax.grid(True)
plt.show()

# -- Data Preparation for Modeling --

# Column names used for modeling: the calendar parts are the inputs and 'HPI'
# is the value to predict.
# Optional: Add macroeconomic or external features later
target = 'HPI'
features = ['Year', 'Month']

# Split the cleaned frame into the input columns and the target column.
X, y = df_clean[features], df_clean[target]

print("\nFeatures and target prepared for modeling.")
print("X shape:", X.shape)
print("y shape:", y.shape)

# -- Save processed data --
processed_data_path = "data/processed_toronto_hpi.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
output_dir = os.path.dirname(processed_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_clean.to_csv(processed_data_path, index=False)
print(f"Cleaned dataset saved to: {processed_data_path}")
