In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df=pd.read_csv('modified.csv')

In [4]:
df.isnull().sum()

price              0
bedRoom            0
bathroom           0
balcony            0
floor_type         0
agePossession      0
sector             0
type               0
built_up_area      0
servant room       0
pooja room         0
furnishing_type    0
luxury             0
dtype: int64

In [5]:
df.head(5)

Unnamed: 0,price,bedRoom,bathroom,balcony,floor_type,agePossession,sector,type,built_up_area,servant room,pooja room,furnishing_type,luxury
0,0.45,2.0,2.0,1,low floor,relatively new,sector 7,flat,900.0,no,no,unfurnished,low
1,0.5,2.0,2.0,1,low floor,old property,sector 3,flat,650.0,no,no,semifurnished,normal
2,0.4,2.0,2.0,3,high floor,new property,sohna road,flat,595.0,no,no,unfurnished,normal
3,1.47,2.0,2.0,2,low floor,new property,sector 61,flat,1200.0,no,no,unfurnished,semi_luxrious
4,0.7,2.0,2.0,3,mid floor,under construction,sector 92,flat,1076.0,no,no,unfurnished,low


In [6]:
print(df['balcony'].unique())
print(df['floor_type'].unique())
print(df['furnishing_type'].unique())
print(df['luxury'].unique())
print(df['agePossession'].unique())

['1' '3' '2' '3+' '0']
['low floor' 'high floor' 'mid floor' 'groundfloor' 'hometop']
['unfurnished' 'semifurnished' 'furnished']
['low' 'normal' 'semi_luxrious' 'luxrious']
['relatively new' 'old property' 'new property' 'under construction'
 'moderately old']


In [7]:
x=df.drop(columns=['price'],axis=1)
y=np.log1p(df['price'])

In [8]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [9]:
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,FunctionTransformer
from sklearn.compose import ColumnTransformer
from scipy.stats import yeojohnson
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error,mean_squared_error


In [10]:
df['furnishing_type'].unique()

array(['unfurnished', 'semifurnished', 'furnished'], dtype=object)

In [11]:
def yejhonson_transform(x):
    transformed,_ =yeojohnson(x)
    return transformed

In [12]:
transformer=ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(drop='first',sparse_output=False,),['agePossession','type']),
    ('tnf2',OrdinalEncoder(categories=[['no','yes']]),['servant room']),
    ('tnf3',OrdinalEncoder(categories=[['no','yes']]),['pooja room']),
     ('tnf4',OrdinalEncoder(categories=[['groundfloor','mid floor','high floor','low floor','hometop']]),['floor_type']),
    ('tnf5',OrdinalEncoder(categories=[['low','normal','semi_luxrious','luxrious']]),['luxury']),
    ('tnf6',OrdinalEncoder(categories=[['1','0','2','3','3+']]),['balcony']),
    ('tnf7',OrdinalEncoder(categories=[['unfurnished', 'semifurnished', 'furnished']]),['furnishing_type']),
    ('tnf8',ce.TargetEncoder(),['sector']),
],remainder='passthrough')


In [13]:
transformer

# Model evaluation

In [21]:
def model_evaluation(model_name,model)->list:
  models=[]
  models.append(model_name)
  pipeline=Pipeline([
      ('preproceesing',transformer),
      ('scaling',StandardScaler()),
      ('model',model)
  ])
  # k-fold cross validation
  kfold = KFold(n_splits=10, shuffle=True, random_state=42)
  scores = cross_val_score(pipeline, x, y, cv=kfold, scoring='r2')
  models.append(scores.mean())# r2 score
  # train_test _split
  x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42)
  pipeline.fit(x_train,y_train)
  y_pred = pipeline.predict(x_test)
  y_pred = np.expm1(y_pred)
  models.append(mean_absolute_error(np.expm1(y_test),y_pred))# mean absolute error
  models.append(scores.std())

  return models




In [22]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor(),
    'KNN':KNeighborsRegressor(n_neighbors=5)
}


In [23]:
output=[]
for model_name,model in model_dict.items():
  output.append(model_evaluation(model_name,model))
output=pd.DataFrame(output,columns=['model','r2','mae','std'])



In [24]:
output.sort_values(['mae'])

Unnamed: 0,model,r2,mae,std
6,extra trees,0.883981,0.520201,0.019418
10,xgboost,0.878097,0.545499,0.020113
5,random forest,0.881198,0.552902,0.020634
7,gradient boosting,0.863904,0.634386,0.020765
1,svr,0.840997,0.637279,0.026312
9,mlp,0.828408,0.702794,0.022856
4,decision tree,0.7945,0.730413,0.032466
11,KNN,0.795943,0.763155,0.02858
2,ridge,0.793154,0.796989,0.030757
0,linear_reg,0.793151,0.797061,0.030762


**Extra tree have extropolation issue and i deploy the extra tree model **

# xgboost model

In [25]:
pipeline=Pipeline([
      ('preproceesing',transformer),
      ('scaling',StandardScaler()),
      ('regressor',XGBRegressor())
  ])

In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
# hyperparameter tuning
param_grid={
    'regressor__booster':['gbtree','dart'],
    'regressor__n_estimators': [50, 100, 150],
    'regressor__learning_rate':np.linspace(0.01, 0.2, 10),
    'regressor__min_split_loss':[0,0.1,0.2,0.3,0.4],
    'regressor__max_depth':[6,5],
    'regressor__min_child_weight':[1,5,10],
    'regressor__reg_lambda':[0.1,0.5,0.3],
    'regressor__reg_alpha':[0.1,0.5,0.3],
    'regressor__tree_method':['hist','approx','auto'],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0]
}

In [28]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [29]:
# randomized search cv
random_cv=RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    n_iter=100,
    cv=kfold,
    n_jobs=-1
)
random_cv.fit(x,y)
best_params = random_cv.best_params_
best_model = random_cv.best_estimator_
print(best_params)
print(best_model)


{'regressor__tree_method': 'approx', 'regressor__reg_lambda': 0.1, 'regressor__reg_alpha': 0.3, 'regressor__n_estimators': 100, 'regressor__min_split_loss': 0, 'regressor__min_child_weight': 1, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.1577777777777778, 'regressor__colsample_bytree': 0.8, 'regressor__booster': 'dart'}
Pipeline(steps=[('preproceesing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tnf1',
                                                  OneHotEncoder(drop='first',
                                                                sparse_output=False),
                                                  ['agePossession', 'type']),
                                                 ('tnf2',
                                                  OrdinalEncoder(categories=[['no',
                                                                              'yes']]),
                                         

In [30]:
pipeline.set_params(**best_params)

In [31]:
scores = cross_val_score(pipeline, x, y, cv=kfold, scoring='r2')
print('r2_score :',scores.mean())
print('variance :',scores.std())

r2_score : 0.8792992546016907
variance : 0.005752632328894826


# Exporting model

In [32]:
import pickle
with open('xgboost_pipeline.pkl','wb') as file:
  pickle.dump(pipeline,file)