In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head() # converted encoded categories back to normal so that we do the encoding within a scikit learn pipeline

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 24,4.25,3,4,3+,Old Property,3201.0,1,0,1,Medium,Mid Floor
1,flat,sector 70,1.18,2,2,3,New Property,1400.0,0,0,0,Medium,Mid Floor
2,flat,sector 77,0.71,2,2,2,Relatively New,1057.0,0,0,0,Medium,Mid Floor
3,flat,sector 95,0.2,1,1,1,Relatively New,407.0,0,0,0,Medium,Mid Floor
4,flat,sector 90,0.9,3,4,3,Relatively New,1765.0,1,0,0,High,Mid Floor


In [4]:
df.shape

(3590, 13)

In [5]:
df['furnishing_type'].value_counts()

Unnamed: 0_level_0,count
furnishing_type,Unnamed: 1_level_1
0,2651
1,762
2,177


In [6]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [7]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 24,4.25,3,4,3+,Old Property,3201.0,1,0,semifurnished,Medium,Mid Floor
1,flat,sector 70,1.18,2,2,3,New Property,1400.0,0,0,unfurnished,Medium,Mid Floor
2,flat,sector 77,0.71,2,2,2,Relatively New,1057.0,0,0,unfurnished,Medium,Mid Floor
3,flat,sector 95,0.2,1,1,1,Relatively New,407.0,0,0,unfurnished,Medium,Mid Floor
4,flat,sector 90,0.9,3,4,3,Relatively New,1765.0,1,0,unfurnished,High,Mid Floor


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3590 entries, 0 to 3589
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3590 non-null   object 
 1   sector           3590 non-null   object 
 2   price            3590 non-null   float64
 3   bedRoom          3590 non-null   int64  
 4   bathroom         3590 non-null   int64  
 5   balcony          3590 non-null   object 
 6   agePossession    3590 non-null   object 
 7   built_up_area    3590 non-null   float64
 8   servant room     3590 non-null   int64  
 9   store room       3590 non-null   int64  
 10  furnishing_type  3590 non-null   object 
 11  luxury_category  3590 non-null   object 
 12  floor_category   3587 non-null   object 
dtypes: float64(2), int64(4), object(7)
memory usage: 364.7+ KB


In [9]:
df.isnull().sum()

Unnamed: 0,0
property_type,0
sector,0
price,0
bedRoom,0
bathroom,0
balcony,0
agePossession,0
built_up_area,0
servant room,0
store room,0


In [10]:
df = df[~df['floor_category'].isnull()]

In [11]:
df.isnull().sum()

Unnamed: 0,0
property_type,0
sector,0
price,0
bedRoom,0
bathroom,0
balcony,0
agePossession,0
built_up_area,0
servant room,0
store room,0


In [12]:
x = df.drop(columns=['price'])
y = df['price']

In [13]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [14]:
# ordinal encoding

In [15]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [16]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

In [17]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [18]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [19]:
scores.mean(),scores.std()

(0.7383593038936562, 0.041401571460814524)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

In [21]:
pipeline.fit(x_train,y_train)

In [22]:
y_pred = pipeline.predict(x_test)

In [23]:
y_pred = np.expm1(y_pred)

In [24]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.8172892906600077

In [25]:
# convert the above flow and create a function
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(x_train,y_train)

    y_pred = pipeline.predict(x_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [26]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [27]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [28]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [29]:
model_df.sort_values(['mae']) # tree based models are not affected by ordinal encoding

Unnamed: 0,name,r2,mae
5,random forest,0.883703,0.445337
10,xgboost,0.894525,0.450397
7,gradient boosting,0.875687,0.501519
6,extra trees,0.870956,0.525856
4,decision tree,0.785063,0.582777
9,mlp,0.810491,0.681147
1,svr,0.767274,0.774009
0,linear_reg,0.738359,0.817289
2,ridge,0.738362,0.817326
8,adaboost,0.758386,0.823726


In [30]:
# one hot encoding

In [31]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        # we are not one hot encoding all cols because there are cols which has order
        # example -> number of blacony -> 1>2>3
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ],
    remainder='passthrough'
)

In [32]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [33]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [34]:
scores.mean(), scores.std()

(0.8567715573342702, 0.024307598399341296)

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

In [36]:
pipeline.fit(x_train,y_train)

In [37]:
y_pred = pipeline.predict(x_test)

In [38]:
y_pred = np.expm1(y_pred)

In [39]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.5702086958364329

In [40]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(x_train,y_train)

    y_pred = pipeline.predict(x_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [41]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [42]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [43]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [44]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894125,0.405873
5,random forest,0.893132,0.406146
10,xgboost,0.89488,0.421899
9,mlp,0.873924,0.477177
7,gradient boosting,0.878667,0.483437
0,linear_reg,0.856772,0.570209
2,ridge,0.856701,0.575519
4,decision tree,0.805882,0.588138
1,svr,0.775235,0.761087
8,adaboost,0.751972,0.818587


In [45]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [46]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [47]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [48]:
scores.mean()

0.05777081334919898

In [49]:
scores.std()

0.015234267535050562

In [50]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)), # because OHE increases dimensionality
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(x_train,y_train)

    y_pred = pipeline.predict(x_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [51]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [52]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [53]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [54]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.758051,0.663491
6,extra trees,0.730099,0.667276
4,decision tree,0.684469,0.73432
10,xgboost,0.608526,0.867191
7,gradient boosting,0.618392,0.953624
1,svr,0.210965,1.340977
8,adaboost,0.305245,1.346201
9,mlp,0.20553,1.390622
3,LASSO,0.054501,1.474926
2,ridge,0.057771,1.478109


In [55]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [56]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [57]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [58]:
scores.mean(),scores.std()

(0.8312711872562375, 0.02261773176410284)

In [59]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(x_train,y_train)

    y_pred = pipeline.predict(x_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [60]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [61]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [62]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [63]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.904529,0.404867
10,xgboost,0.901299,0.414376
6,extra trees,0.904736,0.415973
7,gradient boosting,0.890639,0.470243
4,decision tree,0.808157,0.525281
9,mlp,0.850079,0.566021
0,linear_reg,0.831271,0.63815
2,ridge,0.831284,0.638382
8,adaboost,0.813579,0.679871
1,svr,0.787686,0.748268


In [64]:
# conclusion -> tree based models like xgboost and random forest are giving the best result
# encoding -> target encoding of sector

In [65]:
from sklearn.model_selection import GridSearchCV

In [66]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['log2', 'sqrt', None, 0.5]
}

In [67]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [68]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [69]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [70]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [71]:
search.fit(x, y_transformed)

Fitting 10 folds for each of 256 candidates, totalling 2560 fits


In [72]:
final_pipe = search.best_estimator_

In [73]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 0.5,
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [74]:
search.best_score_

0.9081315139809313

In [75]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [76]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [77]:
pipeline.fit(x,y_transformed)

In [78]:
 y_pred = pipeline.predict(x_test)

In [79]:
 y_pred = np.expm1(y_pred)

In [80]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [81]:
scores.mean(),scores.std()

(0.8939269947430555, 0.02774661600128804)

In [82]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.1445297358071627

In [83]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [84]:
with open('df.pkl', 'wb') as file:
    pickle.dump(x, file)

Trying out the predictions

In [85]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [86]:
np.expm1(pipeline.predict(one_df))

array([3.1260867])

In [87]:
import numpy
print(numpy.__version__)

1.26.4


In [88]:
pip install numpy==1.26.4
# to match streamlit's numpy version

