## Preprocessing steps for the raw dataset

### Keep only entries for the town "BEDOK" and aggregate them to monthly mean price per square meter

In [5]:
from pathlib import Path

import pandas as pd

# Build a monthly mean price-per-square-meter series for the town "BEDOK"
# from the raw transactions CSV and write it out as a two-column CSV
# (timestamp, price_psm).
input_file_path = "../data/raw/ALL_Prices_1990_2021_mar.csv"
output_file_path = "../data/processed/bedok_processed.csv"

# Load every raw row; the filter below narrows it to a single town.
df = pd.read_csv(input_file_path)

# Keep only the "BEDOK" rows. The explicit copy makes the column assignment
# below write to an independent frame instead of a slice of `df`.
df_bedok = df[df["town"] == "BEDOK"].copy()

# Parse the "month" strings (format "%Y-%m") into datetimes so the rows can
# be resampled on a time index.
df_bedok["timestamp"] = pd.to_datetime(df_bedok["month"], format="%Y-%m")

# Drop everything except the timestamp and the price per square meter, and
# index by timestamp, which resample() requires.
df_bedok = df_bedok[["timestamp", "price_psm"]].set_index("timestamp")

# Collapse to one row per month-start ("MS") holding the mean price_psm, then
# turn the timestamp index back into an ordinary column for export.
# NOTE: a month with no rows still gets a bin, and its mean is NaN.
df_monthly = df_bedok.resample("MS").mean().reset_index()

# to_csv() does not create missing directories, so make sure the output
# folder exists before writing; otherwise a fresh checkout fails here.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# Write the monthly series to the processed-data folder, formatting the
# timestamps back to "YYYY-MM".
df_monthly.to_csv(output_file_path, index=False, date_format="%Y-%m")

print(f"Preprocessed data saved to {output_file_path}")

Preprocessed data saved to ../data/processed/bedok_processed.csv
