In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the feature-selected properties dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='gurgaon_properties_post_feature_selection.csv')

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,27.0,3,4,4.0,2.0,3201.0,1,0,1,2.0,2.0,4.25
1,0.0,71.0,2,2,3.0,1.0,1400.0,0,0,0,2.0,2.0,1.18
2,0.0,77.0,2,2,2.0,3.0,1057.0,0,0,0,2.0,2.0,0.71
3,0.0,95.0,1,1,1.0,3.0,407.0,0,0,0,2.0,2.0,0.2
4,0.0,91.0,3,4,3.0,3.0,1765.0,1,0,0,0.0,2.0,0.9


In [8]:
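# baseline model -> linear model -> why? because we want to predict price (a continuous target)
# to apply a linear model -> convert ordinal encoding to one-hot encoding
# also do scaling
# NOTE: the pipeline built further down fits an SVR with an RBF kernel, not a linear model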

In [9]:
# Target is the 'price' column; every other column of df is used as a feature.
y = df['price']
x = df.drop('price', axis=1)

In [11]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [12]:
# Columns that are one-hot encoded by the preprocessing step (one name per line for easy diffing).
columns_to_encode = [
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [13]:
# Model the target on a log scale: y_transformed = log(1 + y).
# Predictions made on this scale must be mapped back with np.expm1.
y_transformed = y.pipe(np.log1p)

In [14]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # transformation 1 -> standardise the columns treated as numeric
        ('num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        # transformation 2 -> one-hot encoding of the columns in columns_to_encode.
        # handle_unknown='ignore' keeps transform() from raising when a fold or the
        # hold-out split contains a category that was absent from the data the
        # encoder was fitted on; such rows are encoded as all zeros for that
        # column (the same encoding as the dropped first category).
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), columns_to_encode)
    ],
    # any column not listed in a transformer above is forwarded unchanged (unscaled)
    remainder='passthrough'
)

In [15]:
# Chain preprocessing and the estimator so both are re-fitted together
# on whatever data the pipeline is trained on.
svr_rbf = SVR(kernel='rbf')
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', svr_rbf),
    ]
)

In [16]:
# 10-fold cross-validation with shuffling (fixed seed for repeatable folds);
# each fold is scored with R^2 against the log1p-transformed target.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_val_score(
    estimator=pipeline,
    X=x,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)

In [17]:
# Average R^2 across the cross-validation folds.
np.mean(scores)

np.float64(0.8861231449716325)

In [18]:
# Spread of R^2 across folds: less variation -> consistent result.
np.std(scores)

np.float64(0.014715318611719194)

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the target stays on the log1p scale.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X=x_train, y=y_train)

In [23]:
# Predictions for the hold-out rows, still on the log1p scale the model was trained on.
y_pred = pipeline.predict(X=x_test)

In [24]:
# Bring predictions back to the normal (original price) scale.
# Recomputed from the fitted pipeline rather than from y_pred itself, so that
# re-running this cell does not apply expm1 a second time to already
# back-transformed values.
y_pred = np.expm1(pipeline.predict(x_test))

In [25]:
from sklearn.metrics import mean_absolute_error
# MAE on the original price scale (y_test is mapped back with expm1 to match y_pred);
# MAE is used because we have outliers.
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.5594837734791199