# Read data from csv file

In [30]:
import pandas as pd
import numpy as np

df = pd.read_csv("Melbourne_Housing.csv")

df.shape

  df = pd.read_csv("Melbourne_Housing.csv")


(34857, 22)

# Check missing data in csv file

In [31]:
def check_missing_value(data):
    missing_like_values = [
        "NaN", "nan", "NA", "N/A", "na", "null", "Null", "NULL",
        "missing", "Missing", "MISSING", "inf", "Inf", "INF", "∞", "-inf"
    ]
    # replace all missing value to NaN
    df = data.replace(missing_like_values, np.nan)
    df = df.replace([np.inf, -np.inf], np.nan)

    # check how many column with mssing values
    missing_count = df.isna().sum()
    missing_percent = (missing_count / len(df)) * 100

    missing_table = (
        pd.DataFrame({
            "Missing Count": missing_count[missing_count > 0],
            "Missing %": missing_percent[missing_percent > 0].round(2)
        })
        .sort_values(by="Missing %", ascending=False)
    )

    print("The number of column with missing values", len(missing_table))
    print(missing_table.head(df.shape[1]))

check_missing_value(df)

The number of column with missing values 13
               Missing Count  Missing %
BuildingArea           21115      60.58
YearBuilt              19306      55.39
Landsize               11810      33.88
Car                     8728      25.04
Bathroom                8226      23.60
Bedroom                 8217      23.57
Latitude                7976      22.88
Longtitude              7976      22.88
Price                   7610      21.83
CouncilArea                3       0.01
Propertycount              3       0.01
Distance                   1       0.00
Postcode                   1       0.00


# Delete all data with missing value for councilarea,propertycount,distance,postcode and price

In [32]:
cols_to_drop_na = ["Price", "CouncilArea", "Propertycount", "Distance", "Postcode"]
df = df.dropna(subset=cols_to_drop_na)

check_missing_value(df)
df.shape

The number of column with missing values 8
              Missing Count  Missing %
BuildingArea          16588      60.89
YearBuilt             15160      55.65
Landsize               9262      34.00
Car                    6821      25.04
Bathroom               6444      23.65
Bedroom                6438      23.63
Latitude               6251      22.94
Longtitude             6251      22.94


(27244, 22)

# Using mean value of Latitude and Longtiitude in same suburb and regionname fill in the msissing value

In [33]:
df["Latitude"] = df.groupby(["Suburb", "Regionname"])["Latitude"].transform(
    lambda x: x.fillna(x.mean())
)

df["Longtitude"] = df.groupby(["Suburb", "Regionname"])["Longtitude"].transform(
    lambda x: x.fillna(x.mean())
)

# some combination still have missing value
df["Latitude"].fillna(df["Latitude"].mean(), inplace=True)
df["Longtitude"].fillna(df["Longtitude"].mean(), inplace=True)

check_missing_value(df)


The number of column with missing values 6
              Missing Count  Missing %
BuildingArea          16588      60.89
YearBuilt             15160      55.65
Landsize               9262      34.00
Car                    6821      25.04
Bathroom               6444      23.65
Bedroom                6438      23.63


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Latitude"].fillna(df["Latitude"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Longtitude"].fillna(df["Longtitude"].mean(), inplace=True)


# Fill in missing value in bedroom and Bathroom

In [34]:
# using rooms number fill in missing value in bedroom
df["Bedroom"].fillna(df["Rooms"], inplace=True)

# group by bedroom，calculate avg for bathroom
bedroom_group_mean = df.groupby("Bedroom")["Bathroom"].mean()

print(bedroom_group_mean)

df["Bathroom"] = df.groupby(["Bedroom"])["Bathroom"].transform(
    lambda x: x.fillna(x.mean())
)

check_missing_value(df)

Bedroom
0.0     0.937500
1.0     1.015385
2.0     1.142036
3.0     1.487890
4.0     2.058564
5.0     2.685336
6.0     3.054545
7.0     3.000000
8.0     3.777778
9.0     7.000000
10.0    6.000000
12.0    5.000000
16.0    8.000000
20.0    1.000000
Name: Bathroom, dtype: float64
The number of column with missing values 4
              Missing Count  Missing %
BuildingArea          16588      60.89
YearBuilt             15160      55.65
Landsize               9262      34.00
Car                    6821      25.04


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Bedroom"].fillna(df["Rooms"], inplace=True)


In [35]:
df.to_csv("cleaned_melbourne_housing.csv", index=False)