# Split the date into day of month, month, and year

In [62]:
import pandas as pd

# Load the cleaned housing data.
df = pd.read_csv("cleaned_melbourne_housing.csv")

# Parse the day-first dates; errors="coerce" turns unparseable values into
# NaT. Popping the column also removes the raw "Date" from the frame.
parsed_dates = pd.to_datetime(df.pop("Date"), errors="coerce", dayfirst=True)

# Append the calendar components as separate columns.
df = df.assign(
    Year=parsed_dates.dt.year,
    Month=parsed_dates.dt.month,
    Day=parsed_dates.dt.day,
)

# Map each suburb to an integer Suburb_ID

In [63]:
# Encode each distinct suburb as an integer code; suburb_index holds the
# unique suburb names, positioned by their code.
suburb_codes, suburb_index = pd.factorize(df["Suburb"])
df["Suburb_ID"] = suburb_codes

# Lookup table pairing every suburb name with its integer ID.
suburb_map = pd.DataFrame({"Suburb": suburb_index})
suburb_map["Suburb_ID"] = range(len(suburb_map))

# The integer ID replaces the raw suburb name in the feature table.
df = df.drop(columns="Suburb")

In [64]:
# Extract the leading token of each address (a run of digits, letters and
# "/") into UnitNumber; rows where the pattern does not match get NaN.
unit_pattern = r"^([\dA-Za-z/]+)"
df["UnitNumber"] = df["Address"].str.extract(unit_pattern, expand=False)
# StreetType extraction is currently disabled:
# df["StreetType"] = df["Address"].str.extract(r"\b([A-Za-z]+)$")

# The raw address is removed once the leading token has been extracted.
df = df.drop(columns="Address")

In [65]:
# Collapse each Regionname into a compass direction (North/East/South/West).
region_direction_pairs = (
    ("Eastern Metropolitan", "East"),
    ("South-Eastern Metropolitan", "South"),
    ("Southern Metropolitan", "South"),
    ("Western Metropolitan", "West"),
    ("Northern Metropolitan", "North"),
    ("Eastern Victoria", "East"),
    ("Western Victoria", "West"),
    ("Northern Victoria", "North"),
)
region_map = dict(region_direction_pairs)

# Region names that are not keys of region_map become NaN in Direction.
df = df.assign(Direction=df["Regionname"].map(region_map))

# One-hot encode CouncilArea, Method, Regionname, ParkingArea and Direction; drop the Address and SellerG features

In [66]:
# One-hot encode all categorical columns in a single call. Each group of
# indicator columns is prefixed with its source column name (the default),
# and the groups are appended in the order listed here.
categorical_cols = [
    "CouncilArea",
    "Method",
    "Regionname",
    "ParkingArea",
    "Direction",
]
df = pd.get_dummies(df, columns=categorical_cols)
# StreetType encoding is currently disabled:
# df = pd.get_dummies(df, columns=["StreetType"], prefix="StreetType")

# SellerG is dropped rather than encoded.
df = df.drop(columns="SellerG")

df.shape


(27244, 94)

# Use polynomial expansion to construct features

In [67]:
from sklearn.preprocessing import PolynomialFeatures

# Numeric columns that are expanded into degree-2 polynomial terms.
poly_cols = ["Rooms", "Bedroom", "Bathroom", "Distance", "Propertycount"]

# Degree-2 expansion without the constant (bias) column: the output holds the
# original columns, their squares and all pairwise products.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[poly_cols])
poly_feature_names = poly.get_feature_names_out(poly_cols)

# Reuse df's index: the axis=1 concat below aligns rows by index label, so a
# fresh RangeIndex would misalign rows (and add NaN rows) whenever df's index
# is not exactly 0..n-1, e.g. after rows were filtered upstream.
df_poly = pd.DataFrame(
    poly_features,
    columns=poly_feature_names,
    index=df.index,
)

# The degree-1 terms duplicate columns already present in df; keep only the
# newly constructed terms.
df_poly = df_poly.drop(columns=poly_cols)
df = pd.concat([df, df_poly], axis=1)

df.shape[1]

109

# Convert all bool columns to int and save the CSV file

In [68]:
# Column labels of the final feature table (kept as a module-level name in
# case later cells refer to it).
features = df.columns

# Cast every boolean column to int so the saved CSV contains 0/1 rather than
# True/False. Columns are selected by dtype and converted in one astype call
# instead of a Python-level loop over all columns.
bool_cols = df.select_dtypes(include="bool").columns
df = df.astype({col: int for col in bool_cols})

# Persist the engineered dataset without the index column.
df.to_csv("dataset.csv", index=False)

df.shape

(27244, 109)