In [None]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
# Load the IoT temperature readings and preview the first rows.
IOT_TEMP_CSV = "Datasets/IOT-temp.csv"
data = pd.read_csv(IOT_TEMP_CSV)
data.head()

In [None]:
# Column dtypes and non-null counts for the IoT frame.
data.info()

In [None]:
# Summary statistics for the IoT frame.
data.describe()

In [None]:
# Number of missing values per column in the IoT frame.
data.isna().sum()

In [None]:
# The example IoT dataset had no significant missing values, so the Bangalore
# House Price dataset is used instead: per the author it contains missing values,
# categorical variables, and extreme outliers, which lets the notebook demonstrate
# data cleaning, missing-value imputation, and outlier handling.
BHP_CSV = "Datasets/BHP.csv"
data1 = pd.read_csv(BHP_CSV)
data1.head()

In [None]:
# Column dtypes and non-null counts for the house-price frame.
data1.info()

In [None]:
# Summary statistics for the house-price frame before any cleaning.
data1.describe()

In [None]:
# Number of missing values per column; drives the imputation choices below.
data1.isna().sum()

In [None]:
# `society` has a lot of missing values, so the whole column is discarded
# rather than imputed.
data1 = data1.drop(columns=["society"])

In [None]:
# Fill missing `location` entries with the placeholder "Unknown" instead of
# dropping the rows (the author's inspection found only one missing value).
data1["location"] = data1["location"].fillna("Unknown")

In [None]:
# Count of missing `location` values; expected to be 0 once the placeholder fill has run.
data1["location"].isna().sum()

In [None]:
# Replace the text column `size` with a numeric `bhk` column:
#   1. pull the first run of digits out of each `size` string and cast it to float
#      (rows with no digits, or a missing `size`, become NaN);
#   2. drop the original `size` column;
#   3. fill the remaining NaNs with the median of the extracted values.
# The pattern is a raw string because `\d` in a plain literal is an invalid escape
# sequence, and `expand=False` makes `str.extract` return a Series instead of a
# one-column DataFrame.
bhk = data1['size'].str.extract(r'(\d+)', expand=False).astype(float)
data1 = data1.drop(columns=['size'])
data1['bhk'] = bhk.fillna(bhk.median())

In [None]:
# Re-check per-column missing counts after `size` was replaced by `bhk`.
data1.isna().sum()

In [None]:
#Using the median to fill missing values in size (bhk), bath and balcony because the mean is sensitive to outliers and extreme values, and size and bath contain
#a lot of outliers. The mode is not preferred because size, bath and balcony are numerical values, for which it is less suitable.

In [None]:
# Impute missing `bath` values with the column median.
bath_median = data1['bath'].median()
data1['bath'] = data1['bath'].fillna(bath_median)

In [None]:
# Re-check per-column missing counts after imputing `bath`.
data1.isna().sum()

In [None]:
# Impute missing `balcony` values with the column median.
balcony_median = data1['balcony'].median()
data1['balcony'] = data1['balcony'].fillna(balcony_median)

In [None]:
# Re-check per-column missing counts after imputing `balcony`.
data1.isna().sum()

In [None]:
# Outlier removal on `price` with the IQR rule: keep only rows whose price lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive). Rows with a missing
# price fail the range test and are dropped as well.
q1, q3 = data1['price'].quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
data1 = data1.loc[data1['price'].between(lower, upper)]

In [None]:
# Keep only rows with fewer than 10 in `bath`; the author treats larger
# values as unrealistic.
data1 = data1.loc[data1['bath'].lt(10)]

In [None]:
# Second outlier pass using z-scores: a row is kept only if the absolute
# z-score is below 3 in every one of the listed columns.
zscore_columns = ['price', 'bath', 'balcony']
z_score = np.abs(stats.zscore(data1[zscore_columns]))
within_threshold = (z_score < 3).all(axis=1)
data1 = data1[within_threshold]

In [None]:
# Verification: per-column missing counts on the cleaned frame.
data1.isna().sum()

In [None]:
# Summary statistics after cleaning, to check the effect of the outlier filters.
data1.describe()

In [None]:
# (rows, columns) remaining after cleaning.
data1.shape

In [None]:
# Preview the first rows of the cleaned frame.
data1.head()

In [None]:
# Save the cleaned dataset for Task 2. `index=False` leaves the DataFrame index
# (no longer contiguous after the row filters) out of the file.
data1.to_csv("Datasets/data_clean.csv", index=False)