In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('MagicBricks.csv')

In [4]:
data.head(51)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,15500000,Ready_to_move,New_Property,Builder_Floor,6667.0
6,1350.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,10000000,Ready_to_move,Resale,Builder_Floor,6667.0
7,650.0,2,2.0,Semi-Furnished,"Delhi Homes, Rohini Sector 24",1.0,4000000,Ready_to_move,New_Property,Apartment,6154.0
8,985.0,3,3.0,Unfurnished,Rohini Sector 21,1.0,6800000,Almost_ready,New_Property,Builder_Floor,6154.0
9,1300.0,4,4.0,Semi-Furnished,Rohini Sector 22,1.0,15000000,Ready_to_move,New_Property,Builder_Floor,6154.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


In [6]:
def onehot_encode(df, column, rename=False):
    df = df.copy()
    if rename == True:
        df[column] = df[column].replace({x: i for i, x in enumerate(df[column].unique())})
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [7]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop Per_Sqft column
    df = df.drop('Per_Sqft', axis=1)
    
    # Fill missing values
    for column in ['Bathroom', 'Parking', 'Type']:
        df[column] = df[column].fillna(df[column].mode()[0])
    
    # Binary encoding
    df['Status'] = df['Status'].replace({
        'Almost_ready': 0,
        'Ready_to_move': 1
    })
    df['Transaction'] = df['Transaction'].replace({
        'New_Property': 0,
        'Resale': 1
    })
    df['Type'] = df['Type'].replace({
        'Builder_Floor': 0,
        'Apartment': 1
    })
    
    # One-hot encoding
    df = onehot_encode(df, column='Furnishing', rename=False)
    df = onehot_encode(df, column='Locality', rename=True)
    
    # Split df into X and y
    y = df['Price']
    X = df.drop('Price', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [10]:
X_train

Unnamed: 0,Area,BHK,Bathroom,Parking,Status,Transaction,Type,Furnishing_Furnished,Furnishing_Semi-Furnished,Furnishing_Unfurnished,...,Locality_355,Locality_356,Locality_357,Locality_358,Locality_359,Locality_360,Locality_361,Locality_362,Locality_363,Locality_364
582,1.241514,1.245726,1.397555,-0.216906,0.245293,0.779528,-0.963219,-0.412294,0.872926,-0.631199,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
976,0.006189,0.198508,-0.550619,-0.216906,0.245293,0.779528,1.038186,2.425451,-1.145572,-0.631199,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
886,-0.748732,-1.895929,-1.524706,-0.216906,0.245293,-1.282827,1.038186,-0.412294,-1.145572,1.584285,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
561,0.177762,0.198508,0.423468,0.127408,0.245293,0.779528,-0.963219,-0.412294,0.872926,-0.631199,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
1083,-0.587454,-0.848710,-0.550619,-0.216906,0.245293,0.779528,-0.963219,-0.412294,0.872926,-0.631199,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,-0.165384,-0.848710,-0.550619,-0.216906,0.245293,0.779528,1.038186,-0.412294,-1.145572,1.584285,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
905,1.069941,1.245726,1.397555,0.127408,0.245293,-1.282827,-0.963219,-0.412294,-1.145572,1.584285,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
1096,-0.371272,-0.848710,-0.550619,0.127408,0.245293,-1.282827,-0.963219,-0.412294,-1.145572,1.584285,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371
235,0.074818,1.245726,0.423468,-0.216906,0.245293,-1.282827,-0.963219,-0.412294,0.872926,-0.631199,...,-0.058454,-0.03371,0.0,0.0,-0.144421,0.0,-0.03371,-0.03371,-0.03371,-0.03371


In [11]:
y_train

582     55000000
976     14000000
886      1490000
561     30000000
1083     4000000
          ...   
715     14300000
905     67000000
1096     5500000
235     13000000
1061     8000000
Name: Price, Length: 881, dtype: int64

In [12]:
X_train.mean()

Area           -7.057036e-17
BHK             2.026378e-16
Bathroom       -8.065184e-18
Parking        -2.621185e-17
Status          1.532385e-16
                    ...     
Locality_360    0.000000e+00
Locality_361    1.411407e-17
Locality_362    1.613037e-17
Locality_363    1.814666e-17
Locality_364    1.814666e-17
Length: 375, dtype: float64

In [13]:
X_train.var()

Area            1.001136
BHK             1.001136
Bathroom        1.001136
Parking         1.001136
Status          1.001136
                  ...   
Locality_360    0.000000
Locality_361    1.001136
Locality_362    1.001136
Locality_363    1.001136
Locality_364    1.001136
Length: 375, dtype: float64

In [14]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [15]:
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_test, y_test)))

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f0df6162b60>
Traceback (most recent call last):
  File "/home/om_prakash/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/om_prakash/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/om_prakash/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/om_prakash/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' ob

                     Linear Regression R^2 Score: -23524136129029286918094848.00000
 Linear Regression (L2 Regularization) R^2 Score: 0.67672
 Linear Regression (L1 Regularization) R^2 Score: 0.67647
                   K-Nearest Neighbors R^2 Score: 0.59447
                        Neural Network R^2 Score: -0.62239
Support Vector Machine (Linear Kernel) R^2 Score: -0.62248
   Support Vector Machine (RBF Kernel) R^2 Score: -0.07453
                         Decision Tree R^2 Score: 0.69997
                         Random Forest R^2 Score: 0.79254
                     Gradient Boosting R^2 Score: 0.84284


In [183]:
import joblib 
  
# Save the model as a pickle in a file 
joblib.dump(model, 'filename.pkl') 

['filename.pkl']