# **Delhi House Price Prediction**

### Importing Required Libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings(action='ignore')

### Load Data

In [2]:
df = pd.read_csv('MagicBricks.csv')

In [3]:
# Lets check first five rows of the data
df.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


In [4]:
# Data Shape
df.shape

(1259, 11)

In [5]:
# lets check the missing values in dataframe
df.isnull().sum()

Area             0
BHK              0
Bathroom         2
Furnishing       5
Locality         0
Parking         33
Price            0
Status           0
Transaction      0
Type             5
Per_Sqft       241
dtype: int64

<p>As we can see that there are missing values present in our dataset which we need to handle.</p>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


### Data Preprocessing

In [7]:
def get_uniques(df, columns):
    return {column: list(df[column].unique()) for column in columns if not column == 'Locality'}

In [8]:
def get_categorical_columns(df):
    return [column for column in df.columns if df.dtypes[column] == 'object']

In [9]:
get_uniques(df, get_categorical_columns(df))

{'Furnishing': ['Semi-Furnished', 'Furnished', 'Unfurnished', nan],
 'Status': ['Ready_to_move', 'Almost_ready'],
 'Transaction': ['New_Property', 'Resale'],
 'Type': ['Builder_Floor', 'Apartment', nan]}

In [10]:
def onehot_encoder(df, column, rename=False):
    df = df.copy()
    if rename == True:
        df[column] = df[column].replace({x: i for i, x in enumerate(df[column].unique())})
    dummies_df = pd.get_dummies(df[column], prefix=column,drop_first=True)
    df = pd.concat([df, dummies_df], axis=1)
    df.drop(column, axis=1,inplace=True)
    return df

In [11]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop Per_Sqft column
    df.drop('Per_Sqft', axis=1,inplace=True)
    
    # Fill missing values by mode because they are categorical columns
    missing_values_col = ['Bathroom', 'Parking', 'Type']
    for col in missing_values_col:
        df[col] = df[col].fillna(df[col].mode()[0])
    
    # One-hot encoding
    df = onehot_encoder(df, column='Furnishing', rename=False)
    df = onehot_encoder(df, column='Locality', rename=True)
    
    # Binary encoding
    df['Status'] = df['Status'].replace({
        'Almost_ready': 0,
        'Ready_to_move': 1
    })
    df['Transaction'] = df['Transaction'].replace({
        'New_Property': 0,
        'Resale': 1
    })
    df['Type'] = df['Type'].replace({
        'Builder_Floor': 0,
        'Apartment': 1
    })
    
    # Split data into X and y
    y = df['Price']
    X = df.drop('Price', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=50)
    
    # Scale X
    sc = StandardScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(sc.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [12]:
X_train, X_test, y_train, y_test = preprocess_inputs(df)

In [13]:
# Models
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    XGBRegressor()
]
model_names = [
    'LinearRegression',
    'Lasso',
    'Ridge',
    'KNeighborsRegressor',
    'DecisionTreeRegressor',
    'RandomForestRegressor',
    'XGBRegressor']

score = []
dictionary = {}
for model in range(len(models)):
    reg = models[model]
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)
    score.append(r2_score(y_test,y_pred))
     
dictionary = {'Model Names':model_names,'R2 Score': score}
# Put the accuracies in a data frame.
score_df = pd.DataFrame(dictionary)
score_df.style.set_precision(5)

Unnamed: 0,Model Names,R2 Score
0,LinearRegression,-1.3359109501068867e+25
1,Lasso,0.57197
2,Ridge,0.57759
3,KNeighborsRegressor,0.5063
4,DecisionTreeRegressor,0.69448
5,RandomForestRegressor,0.76096
6,XGBRegressor,0.78661


### Hyperparamter Tuning

In [15]:
model_params = {
    'XGBRegressor':{
        'model':XGBRegressor(),
        'params':{
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7, 10],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.5, 0.7],
            'colsample_bytree': [0.5, 0.7],
            'n_estimators' : [100, 200, 500],
            'objective': ['reg:squarederror']
        }
    },

    'RandomForestRegressor':{
        'model':RandomForestRegressor(),
        'params':{
            'n_estimators':range(100,300,50),
            'criterion':['mse','mae'],
            'max_features':['auto', 'sqrt', 'log2']
        }
    }
}

In [18]:
scores = []
for model_name,model in model_params.items():
    grid = GridSearchCV(model['model'],model['params'],cv=5,return_train_score=False,scoring='r2')
    grid.fit(X_train,y_train)
    scores.append({
      'model':model_name,
      'best_score':grid.best_score_,
      'best_params':grid.best_params_
  })

In [19]:
df_scores = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_scores

Unnamed: 0,model,best_score,best_params
0,XGBRegressor,0.80439,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."
1,RandomForestRegressor,0.796904,"{'criterion': 'mae', 'max_features': 'auto', '..."


### Best Fit

In [24]:
## XGBoost
xgb = XGBRegressor(
    colsample_bytree= 0.7,
    learning_rate= 0.01,
    max_depth= 7,
    min_child_weight = 1,
    n_estimators=500,
    objective = 'reg:squarederror',
    subsample= 0.7
)

xgb.fit(X_train,y_train)
xgb_pred = xgb.predict(X_test)

print(" R^2 Score: {:.5f}".format(r2_score(y_test,xgb_pred)))

 R^2 Score: 0.79267


In [25]:
## Random Forest
rf = RandomForestRegressor(
    criterion= 'mae',
    max_features = 'auto',
    n_estimators= 250
)

rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)

print(" R^2 Score: {:.5f}".format(r2_score(y_test,rf_pred)))

 R^2 Score: 0.77872


In [26]:
# Save the tuned Model to file in the current working directory
import pickle

Pkl_Filename = "XGB.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(xgb, file)