### **1. Load Data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from tqdm.auto import tqdm
from IPython.display import display, HTML
from scipy.stats import norm, beta
# Apply seaborn's default theme, then layer the larger "poster" style on top.
sns.set()

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    None,
)
if poster_style is not None:
    plt.style.use(poster_style)

# Load the raw data from property.csv and preview the first rows.
home = pd.read_csv('property.csv')
display(home.head())

print(home.shape)

  plt.style.use('seaborn-poster')


Unnamed: 0,item_id,deposit,monthly_rent,district_uuid,room_qty,unit_area,has_elevator,building_floor_count,unit_floor,has_storage_area,property_age
0,91c0e569-bddd-4128-9720-2550bb85580e,64800000.0,0,263682f6-d0cd-4569-aeec-e727b76b7665,1.0,42,False,3.0,0.0,True,23.0
1,b00b7919-06be-4d26-98b8-1971787e1d46,72000000.0,4320000,97c9535e-3985-47ce-a84c-a962c838a76b,2.0,116,True,,1.0,True,16.0
2,9eddb6bc-e424-4774-b55f-bfd54366d627,50400000.0,1440000,b790f536-c274-4147-86e0-94d9b6d7352d,1.0,74,False,2.0,0.0,True,19.0
3,12cf6b07-5d56-4126-94d2-ce9cbfe2214f,36000000.0,864000,93d06676-4975-4cc5-919b-3a0c29c7ad43,1.0,60,True,,2.0,False,6.0
4,929eb20c-3694-46b2-b96c-91117b995d1b,28800000.0,1296000,58e59fa9-9947-478f-9cef-bc6a2cbe49a9,1.0,45,True,,1.0,True,4.0


(4930, 11)


### **2. Preprocessing**

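Remove rows whose `unit_area` or `room_qty` is zero, drop rows with missing values (NaN) in the required columns, and store the result in a new DataFrame, `clean_home`.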

In [None]:
# Columns that must be present for a row to be kept.
# 'building_floor_count' is not listed here, so rows missing it survive.
required_cols = [
    'district_uuid',
    'room_qty',
    'property_age',
    'deposit',
    'has_storage_area',
    'has_elevator',
    'unit_floor',
]

# Keep only rows where neither the unit area nor the room count is zero.
has_nonzero_size = home['unit_area'].ne(0) & home['room_qty'].ne(0)

clean_home = home.loc[has_nonzero_size].dropna(subset=required_cols)


In [None]:
# One-hot encode the two flag columns; drop_first=True drops the first level
# of each so no redundant indicator column is produced.
flag_columns = ['has_elevator', 'has_storage_area']
clean_home = pd.get_dummies(clean_home, columns=flag_columns, drop_first=True)


In [None]:
!pip install fancyimpute


Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29880 sha256=bb4e50875d4c794c63a3090140235e9ab3e76f90542a3aba32c9cbaadcfe9537
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

Use KNN to impute the missing values of `building_floor_count`.

In [None]:
import fancyimpute
# Select the feature columns for imputation (everything except the two ID columns).
X = clean_home[[c for c in clean_home.columns if c not in ['item_id','district_uuid']]].copy()

# Use the 3 nearest rows which have a feature to fill in each row's missing features.
X_filled_knn = fancyimpute.KNN(k=3).fit_transform(X)

# Look the target column up by name rather than by a hard-coded position:
# pd.get_dummies appends its encoded columns at the end of the frame, which
# shifts positions so that a fixed index would pick a different feature.
floor_count_pos = X.columns.get_loc('building_floor_count')

# The imputed values are stored as returned (not rounded), so filled-in rows
# can hold fractional floor counts.
clean_home['imputing_building_floor_count'] = X_filled_knn[:, floor_count_pos]

Imputing row 1/4783 with 0 missing, elapsed time: 3.758
Imputing row 101/4783 with 0 missing, elapsed time: 3.763
Imputing row 201/4783 with 0 missing, elapsed time: 3.766
Imputing row 301/4783 with 1 missing, elapsed time: 3.768
Imputing row 401/4783 with 1 missing, elapsed time: 3.771
Imputing row 501/4783 with 0 missing, elapsed time: 3.774
Imputing row 601/4783 with 1 missing, elapsed time: 3.777
Imputing row 701/4783 with 0 missing, elapsed time: 3.780
Imputing row 801/4783 with 1 missing, elapsed time: 3.785
Imputing row 901/4783 with 1 missing, elapsed time: 3.788
Imputing row 1001/4783 with 1 missing, elapsed time: 3.792
Imputing row 1101/4783 with 0 missing, elapsed time: 3.798
Imputing row 1201/4783 with 1 missing, elapsed time: 3.802
Imputing row 1301/4783 with 0 missing, elapsed time: 3.806
Imputing row 1401/4783 with 0 missing, elapsed time: 3.809
Imputing row 1501/4783 with 0 missing, elapsed time: 3.810
Imputing row 1601/4783 with 0 missing, elapsed time: 3.813
Imputing 

### **Categorize Features (Feature Engineering)**
Convert numeric features into categorical features by binning them into intervals.

In [None]:
import math
from pandas.api.types import CategoricalDtype

# Derive two categorical features on clean_home by binning numeric columns.

# Binning for 'unit_area': left-closed intervals
# [0, 50), [50, 100), [100, 150), [150, inf).
bins = [0, 50, 100, 150, float('inf')]
labels = ['Small', 'Medium', 'Large', 'Extra Large']
clean_home['unit_area_category'] = pd.cut(clean_home['unit_area'], bins=bins, labels=labels, right=False).astype(str)

# Binning for 'property_age': right-closed intervals so each label matches
# its range: [0, 10], (10, 20], (20, 30], (30, 40], (40, 50], (50, inf).
# With left-closed bins an age of 10 was labelled '11-20' and an age of 20
# was labelled '21-30'. include_lowest=True keeps age 0 inside '0-10'.
age_bins = [0, 10, 20, 30, 40, 50, float('inf')]
age_labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '50+']
clean_home['property_age_category'] = pd.cut(
    clean_home['property_age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
).astype(str)

# Convert the string label columns to the pandas categorical dtype.
category_columns = ['unit_area_category', 'property_age_category']
for col in category_columns:
    clean_home[col] = clean_home[col].astype(CategoricalDtype())

# Display the updated DataFrame
display(clean_home.head())


Unnamed: 0,item_id,deposit,monthly_rent,district_uuid,room_qty,unit_area,has_elevator,building_floor_count,unit_floor,has_storage_area,property_age,imputing_building_floor_count,unit_area_category,property_age_category
0,91c0e569-bddd-4128-9720-2550bb85580e,64800000.0,0,263682f6-d0cd-4569-aeec-e727b76b7665,1.0,42,False,3.0,0.0,True,23.0,3.0,Small,21-30
1,b00b7919-06be-4d26-98b8-1971787e1d46,72000000.0,4320000,97c9535e-3985-47ce-a84c-a962c838a76b,2.0,116,True,,1.0,True,16.0,3.433408,Large,11-20
2,9eddb6bc-e424-4774-b55f-bfd54366d627,50400000.0,1440000,b790f536-c274-4147-86e0-94d9b6d7352d,1.0,74,False,2.0,0.0,True,19.0,2.0,Medium,11-20
3,12cf6b07-5d56-4126-94d2-ce9cbfe2214f,36000000.0,864000,93d06676-4975-4cc5-919b-3a0c29c7ad43,1.0,60,True,,2.0,False,6.0,2.361446,Medium,0-10
4,929eb20c-3694-46b2-b96c-91117b995d1b,28800000.0,1296000,58e59fa9-9947-478f-9cef-bc6a2cbe49a9,1.0,45,True,,1.0,True,4.0,3.931093,Small,0-10
