In [34]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 1.4 MB/s eta 0:01:14
   ---------------------------------------- 0.3/99.8 MB 5.1 MB/s eta 0:00:20
   ---------------------------------------- 0.8/99.8 MB 7.0 MB/s eta 0:00:15
   ---------------------------------------- 0.9/99.8 MB 6.6 MB/s eta 0:00:16
   ---------------------------------------- 1.2/99.8 MB 5.9 MB/s eta 0:00:17
    --------------------------------------- 1.5/99.8 MB 6.0 MB/s eta 0:00:17
    --------------------------------------- 1.9/99.8 MB 6.2 MB/s eta 0:00:16
    --------------------------------------- 2.2/99.8 MB 6.2 MB/s eta 0:00:16
   - -------------------------------------- 2.6/99.8 MB 6.4 MB/s eta 0:00:16
   - -------------------------------------- 3.0/99.8 MB 6.7 MB/s eta 0:00:15
   - --------

In [101]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [3]:
df = pd.read_csv('cleaned_data/gurgaon_properties_post_feature_selection_v2.csv')

In [4]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [194]:
df['luxury_category'].value_counts()

Low       1594
Medium    1465
High       495
Name: luxury_category, dtype: int64

In [5]:
df['furnishing_type'].value_counts()

0.0    2349
1.0    1018
2.0     187
Name: furnishing_type, dtype: int64

In [6]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished

df['furnishing_type'] = df['furnishing_type'].replace({
    0.0 : 'unfurnished',
    1.0 : 'semifurnished',
    2.0 : 'furnished'
})

In [7]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [8]:
X = df.drop(columns=['price'])
y= df['price']

In [9]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [10]:
y_transformed

0       0.598837
1       0.667829
2       0.277632
3       0.955511
4       0.392042
          ...   
3549    0.314811
3550    1.945910
3551    0.470004
3552    2.803360
3553    1.022451
Name: price, Length: 3554, dtype: float64

### Ordinal Encoding

In [11]:
columns_to_encode = ['property_type','sector','balcony','agePossession','furnishing_type','luxury_category','floor_category']

In [12]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
transformers=[
    ('num',StandardScaler(),['bedRoom','bathroom','built_up_area','servant room','store room']),
    ('cat',OrdinalEncoder(),columns_to_encode)
],
    remainder='passthrough'
)

In [13]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

In [14]:
# K-fold cross-validation
kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [16]:
scores.mean(), scores.std()

(0.7363096633436828, 0.03238005754429937)

In [26]:
X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [27]:
pipeline.fit(X_train,y_train)

In [28]:
y_pred = pipeline.predict(X_test)

In [29]:
y_pred = np.expm1(y_pred)

In [30]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089357

In [31]:
def scorer(model_name,model):
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
     
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [36]:
model_dict = {
    'linear_reg':LinearRegression(),
    'SVR':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [37]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))

In [38]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089357],
 ['SVR', 0.7642012011196353, 0.8472636473483951],
 ['ridge', 0.7363125343993554, 0.9463387741853383],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7680979906659116, 0.7516785171484365],
 ['random forest', 0.8816915312512492, 0.5321461353025355],
 ['extra trees', 0.8684190066896905, 0.5412815940692783],
 ['gradient boosting', 0.8727211856507674, 0.5753616385658222],
 ['adaboost', 0.7494181104842216, 0.824568160717483],
 ['mlp', 0.8098956055594444, 0.7157314207387345],
 ['xgboost', 0.8894876835260124, 0.5040475141482346]]

In [39]:
model_df = pd.DataFrame(model_output,columns=['name','r2','mae'])

In [40]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.881692,0.532146
6,extra trees,0.868419,0.541282
7,gradient boosting,0.872721,0.575362
9,mlp,0.809896,0.715731
4,decision tree,0.768098,0.751679
8,adaboost,0.749418,0.824568
1,SVR,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [41]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [42]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

In [43]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [44]:
scores.mean()

0.8546034234543891

In [45]:
scores.std()

0.016001151002273246

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [47]:
pipeline.fit(X_train,y_train)

In [48]:
y_pred = pipeline.predict(X_test)

In [49]:
y_pred = np.expm1(y_pred)

In [50]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497617044593595

In [51]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [52]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [53]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [54]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [55]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894532,0.472745
10,xgboost,0.89585,0.493456
5,random forest,0.891575,0.496811
9,mlp,0.872796,0.546527
7,gradient boosting,0.876794,0.570148
0,linear_reg,0.854603,0.649762
2,ridge,0.854679,0.652834
4,decision tree,0.802957,0.700756
8,adaboost,0.752441,0.832592
1,svr,0.769741,0.834124


### OneHotEncoding with PCA

In [56]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
transformers=[
    ('num',StandardScaler(),['bedRoom','bathroom','built_up_area','servant room','store room']),
    ('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
],
remainder='passthrough')

In [60]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('pca',PCA(n_components=0.95)),
    ('regressor',LinearRegression())
])

In [61]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2',error_score='raise')

In [62]:
scores.mean()

0.06225201431451136

In [63]:
scores.std()

0.01986059407164015

In [64]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [65]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [66]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [67]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [68]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.761538,0.657998
6,extra trees,0.738942,0.701159
4,decision tree,0.696182,0.75729
10,xgboost,0.620664,0.948597
7,gradient boosting,0.610604,0.987906
1,svr,0.218073,1.361163
8,adaboost,0.301174,1.400952
9,mlp,0.199584,1.409537
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707


### Target Encoder

In [70]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   --------------- ------------------------ 30.7/81.9 kB 435.7 kB/s eta 0:00:01
   ----------------------------------- ---- 71.7/81.9 kB 653.6 kB/s eta 0:00:01
   ---------------------------------------- 81.9/81.9 kB 761.9 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [71]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [72]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [73]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [74]:
scores.mean(),scores.std()

(0.829521918225536, 0.018384463379122862)

In [75]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [76]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [77]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [78]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [79]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.90028,0.449554
6,extra trees,0.901946,0.456338
7,gradient boosting,0.889012,0.509361
4,decision tree,0.827711,0.528511
9,mlp,0.851647,0.631114
8,adaboost,0.819226,0.698029
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperparameter Tuning

In [162]:
from sklearn.model_selection import GridSearchCV

RandomForestRegressor

In [197]:
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 5, 10],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'criterion': ['mse', 'mae'],
#     'random_state': [42],
#     'max_samples': [None, 0.5, 0.8],
# }
# param_grid = {
#     'regressor__n_estimators': [50, 100, 200],
#     'regressor__max_depth': [None, 5, 10, 20],
#     'regressor__min_samples_split': [2, 5, 10],
#     'regressor__min_samples_leaf': [1, 2, 4],
#     'regressor__max_features': ['auto', 'sqrt', 'log2'],
#     'regressor__bootstrap': [True, False],
#     'regressor__random_state': [42],
#     'regressor__max_samples': [None, 0.5, 0.8],
# }
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False],
    'regressor__random_state': [42],
}

In [198]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [199]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [200]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [201]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [202]:
search.fit(X,y_transformed) 

Fitting 10 folds for each of 648 candidates, totalling 6480 fits


2160 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1071 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ogeti\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ogeti\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ogeti\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ogeti\anaconda3\Lib\site-packages\sklearn\base.py",

In [203]:
search.best_estimator_

In [204]:
search.best_score_

0.9050331770514456

In [205]:
final_pipe = search.best_estimator_

In [236]:
data = [['flat', 'sector 53', 4, 4, '3+', 'Old Property', 3979, 1, 0, 'semifurnished', 'Low', 'Mid Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 53,4,4,3+,Old Property,3979,1,0,semifurnished,Low,Mid Floor


In [237]:
np.expm1(final_pipe.predict(one_df))

array([8.413906], dtype=float32)

XGBoost

In [117]:
# param_grid1 = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 5, 7],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 0.1, 0.2]
# }
param_grid1 = {
    'xgbregressor__learning_rate': [0.01, 0.1, 0.2],
    'xgbregressor__n_estimators': [100, 200, 300,500],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__min_child_weight': [1, 3, 5],
    'xgbregressor__subsample': [0.8, 0.9, 1.0],
    'xgbregressor__colsample_bytree': [0.8, 0.9, 1.0],
    'xgbregressor__gamma': [0, 0.1, 0.2]
}

In [118]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [119]:
xgb_regressor = XGBRegressor()

In [120]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgbregressor', xgb_regressor)
])

In [121]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [122]:
search1 = GridSearchCV(pipeline, param_grid1, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [123]:
search1.fit(X,y_transformed)

Fitting 10 folds for each of 2187 candidates, totalling 21870 fits


In [124]:
search1.best_estimator_

In [208]:
search1.best_params_

{'xgbregressor__colsample_bytree': 0.9,
 'xgbregressor__gamma': 0,
 'xgbregressor__learning_rate': 0.1,
 'xgbregressor__max_depth': 5,
 'xgbregressor__min_child_weight': 1,
 'xgbregressor__n_estimators': 300,
 'xgbregressor__subsample': 0.9}

In [209]:
search1.best_score_

0.9082990444493421

In [210]:
final_pipe = search1.best_estimator_

In [211]:
final_pipe.fit(X,y_transformed)

In [260]:
data = [['flat', 'sector 110', 5, 5, '3+', 'New Property', 3880, 1, 0, 'semifurnished', 'Low', 'Mid Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 110,5,5,3+,New Property,3880,1,0,semifurnished,Low,Mid Floor


In [261]:
np.expm1(final_pipe.predict(one_df))

array([4.4934955], dtype=float32)

### Exporting the model

In [262]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [263]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(bootstrap=True, max_features='sqrt', n_estimators=500,
                      random_state=42))
])


In [264]:
pipeline.fit(X,y_transformed)

In [271]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [272]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

### Trying out the predictions

In [267]:
data = [['flat', 'sector 108', 2, 3, '3', 'New Property', 1280, 1, 0, 'unfurnished', 'Low', 'Mid Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 108,2,3,3,New Property,1280,1,0,unfurnished,Low,Mid Floor


In [268]:
np.expm1(pipeline.predict(one_df))

array([1.22789562])

In [269]:
data = [['flat', 'sector 93', 3, 2, '3', 'New Property', 1280, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 93,3,2,3,New Property,1280,0,0,unfurnished,Low,Low Floor


In [270]:
np.expm1(pipeline.predict(one_df))

array([0.99441134])