##Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##Loading dataset

In [2]:
df=pd.read_csv('house_prices.csv')

In [3]:
df.head()

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,...,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
0,0,1 BHK Ready to Occupy Flat for sale in Srushti...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,...,,,Srushti Siddhi Mangal Murti Complex,1,2.0,,,,,
1,1,2 BHK Ready to Occupy Flat for sale in Dosti V...,One can find this stunning 2 BHK flat for sale...,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,...,East,Garden/Park,Dosti Vihar,2,,1 Open,Freehold,,,
2,2,2 BHK Ready to Occupy Flat for sale in Sunrise...,Up for immediate sale is a 2 BHK apartment in ...,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,...,East,Garden/Park,Sunrise by Kalpataru,2,,1 Covered,Freehold,,,
3,3,1 BHK Ready to Occupy Flat for sale Kasheli,This beautiful 1 BHK Flat is available for sal...,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,...,,,,1,1.0,,,,,
4,4,2 BHK Ready to Occupy Flat for sale in TenX Ha...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,...,West,"Garden/Park, Main Road",TenX Habitat Raymond Realty,2,,1 Covered,Co-operative Society,,,


In [4]:
df.shape

(83079, 21)

##Data Processing

In [5]:
df.info

In [6]:
df.describe()

Unnamed: 0,Index,Price (in rupees),Dimensions,Plot Area
count,83079.0,75949.0,0.0,0.0
mean,41539.0,7909.079698,,
std,23982.985844,4741.423164,,
min,0.0,17.0,,
25%,20769.5,5250.0,,
50%,41539.0,6522.0,,
75%,62308.5,9161.0,,
max,83078.0,84619.0,,


In [7]:
df = df.drop('Index', axis = 'columns')

In [8]:
df.rename(columns = { 'Amount(in rupees)': 'Amount_in_rupees',
                     'Price (in rupees)': 'Price_in_repees',
                     'Carpet Area': 'Carpet_area_in_sqft'}, inplace = True)

##Check for null values

In [9]:
null_percent = df.isnull().mean() * 100
null_percent

Unnamed: 0,0
Title,0.0
Description,1.658662
Amount_in_rupees,0.0
Price_in_repees,8.582193
location,0.0
Carpet_area_in_sqft,49.905512
Status,0.39962
Floor,4.754511
Transaction,0.033703
Furnishing,2.406144


In [10]:
# Find columns where null percentage is greater than 50
miss_value_50_perc = null_percent[null_percent > 50]

# Drop the columns from the DataFrame
df = df.drop(columns=miss_value_50_perc.index)

In [11]:
df.shape

(83079, 15)

##Convert data from string to float

In [12]:
## Handling Carpet Area:
def convert_to_sqft(area):
    try:
        if pd.notnull(area):
            if 'sqft' in area:
                area= float(area.replace(' sqft',''))
            else:
                area=float(area.replace(' sqm',''))*10.7639
            return area
    except ValueError:
        return np.nan

df['Carpet_area_in_sqft']=df['Carpet_area_in_sqft'].apply(convert_to_sqft)

In [13]:
## Handling the amount in rupees

def convert_rupees(amount_str):
    try:
        parts=amount_str.split()
        amount=float(parts[0])

        if len(parts)>1:
            unit=parts[1].strip()
            if unit=='Lac':
                amount*=100000
            elif unit=='Cr':
                amount*=10000000
        return amount
    except(ValueError,IndexError):
        return None

df['Amount_in_rupees']= df['Amount_in_rupees'].apply(convert_rupees)

In [14]:
df.describe()

Unnamed: 0,Amount_in_rupees,Price_in_repees,Carpet_area_in_sqft
count,79275.0,75949.0,39317.0
mean,13990610.0,7909.079698,1424.345611
std,15721570.0,4741.423164,1479.182124
min,200000.0,17.0,25.0
25%,6355000.0,5250.0,900.0
50%,8800000.0,6522.0,1175.0
75%,15500000.0,9161.0,1700.0
max,600000000.0,84619.0,194936.0


##Fill the null values

###For Categorical Data

In [15]:
categorical_features = df.select_dtypes(include = ['object']).columns
categorical_features.tolist()

['Title',
 'Description',
 'location',
 'Status',
 'Floor',
 'Transaction',
 'Furnishing',
 'facing',
 'overlooking',
 'Bathroom',
 'Balcony',
 'Ownership']

In [16]:
for categorical_feature in categorical_features :
    if df[categorical_feature].isnull().sum() != 0 :
        df[categorical_feature].replace(np.nan, df[categorical_feature].mode()[0], inplace=True)

In [17]:
df.isnull().sum()

Unnamed: 0,0
Title,0
Description,0
Amount_in_rupees,3804
Price_in_repees,7130
location,0
Carpet_area_in_sqft,43762
Status,0
Floor,0
Transaction,0
Furnishing,0


###For numaric data

In [18]:
numaric_features = df.select_dtypes(include = ['float64']).columns
numaric_features.tolist()

['Amount_in_rupees', 'Price_in_repees', 'Carpet_area_in_sqft']

In [19]:
for numaric_feature in numaric_features :
    if df[numaric_feature].isnull().sum() != 0 :
        df[numaric_feature].replace(np.nan, df[numaric_feature].mean(), inplace=True)

In [20]:
df.isnull().sum()

Unnamed: 0,0
Title,0
Description,0
Amount_in_rupees,0
Price_in_repees,0
location,0
Carpet_area_in_sqft,0
Status,0
Floor,0
Transaction,0
Furnishing,0


###Analysis

In [21]:
# Calculate correlation matrix
correlation_matrix = df.select_dtypes(include=['int64', 'float64']).corr()

# Display correlation matrix
print(correlation_matrix)

                     Amount_in_rupees  Price_in_repees  Carpet_area_in_sqft
Amount_in_rupees             1.000000         0.757756             0.363830
Price_in_repees              0.757756         1.000000             0.165931
Carpet_area_in_sqft          0.363830         0.165931             1.000000


###Encoding of categorical data

In [22]:
# Status has only one value for all example that's why remove it.
df.drop(['Status', 'Title', 'Description'], axis = 'columns', inplace = True)

In [23]:
df.head()

Unnamed: 0,Amount_in_rupees,Price_in_repees,location,Carpet_area_in_sqft,Floor,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership
0,4200000.0,6000.0,thane,500.0,10 out of 11,Resale,Unfurnished,East,Garden/Park,1,2,Freehold
1,9800000.0,13799.0,thane,473.0,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,2,2,Freehold
2,14000000.0,17500.0,thane,779.0,10 out of 29,Resale,Unfurnished,East,Garden/Park,2,2,Freehold
3,2500000.0,7909.079698,thane,530.0,1 out of 3,Resale,Unfurnished,East,Garden/Park,1,1,Freehold
4,16000000.0,18824.0,thane,635.0,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",2,2,Co-operative Society


In [24]:
from sklearn.preprocessing import LabelEncoder

def label_encode_multiple(df, columns):
    encoder = LabelEncoder()
    for column in columns:
        df[column] = encoder.fit_transform(df[column])
    return df

label_encode_columns = ['Transaction','location', 'Furnishing', 'facing', 'overlooking', 'Ownership']
df = label_encode_multiple(df, label_encode_columns)

In [25]:
df['Floor'].value_counts()

Unnamed: 0_level_0,count
Floor,Unnamed: 1_level_1
1 out of 4,10283
2 out of 4,3959
4 out of 4,2907
2 out of 5,2765
3 out of 4,2710
...,...
8 out of 26,1
30 out of 44,1
51 out of 78,1
12 out of 52,1


In [26]:
# Split 'Floor' into two separate columns: current_floor and total_floors
df[['current_floor', 'total_floors']] = df['Floor'].str.split(' out of ', expand=True)

# Display the DataFrame with the new columns
print(df[['Floor', 'current_floor', 'total_floors']])

              Floor current_floor total_floors
0      10 out of 11            10           11
1       3 out of 22             3           22
2      10 out of 29            10           29
3        1 out of 3             1            3
4      20 out of 42            20           42
...             ...           ...          ...
83074    4 out of 6             4            6
83075    1 out of 4             1            4
83076    1 out of 4             1            4
83077    1 out of 4             1            4
83078    1 out of 6             1            6

[83079 rows x 3 columns]


In [27]:
encoding_map = {
    'Ground': 0,
    'Upper Basement': -1,
    'Lower Basement': -2,

}

# Apply the encoding to the 'Ownership' column
df['current_floor'] = df['current_floor'].replace(encoding_map)

In [28]:
df['current_floor'] = df['current_floor'].astype(int)

In [29]:
df['total_floors'] = df['total_floors'].fillna(df['total_floors'].mode()[0])

# Convert 'total_floors' column to integer data type
df['total_floors'] = df['total_floors'].astype(int)

In [30]:
df.drop('Floor', axis = 'columns', inplace = True)

In [31]:
df['Bathroom'] = df['Bathroom'].replace({"> 10" : 11})
df['Bathroom'].value_counts()

Unnamed: 0_level_0,count
Bathroom,Unnamed: 1_level_1
2,39798
3,27591
4,10106
1,2904
5,2528
6,96
7,18
11,16
8,9
10,8


In [32]:
df['Bathroom'] = df['Bathroom'].astype(int)


In [33]:
df['Balcony'] = df['Balcony'].replace({"> 10" : 11})
df['Balcony'].value_counts()
df['Balcony'] = df['Balcony'].astype(int)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83079 entries, 0 to 83078
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Amount_in_rupees     83079 non-null  float64
 1   Price_in_repees      83079 non-null  float64
 2   location             83079 non-null  int64  
 3   Carpet_area_in_sqft  83079 non-null  float64
 4   Transaction          83079 non-null  int64  
 5   Furnishing           83079 non-null  int64  
 6   facing               83079 non-null  int64  
 7   overlooking          83079 non-null  int64  
 8   Bathroom             83079 non-null  int64  
 9   Balcony              83079 non-null  int64  
 10  Ownership            83079 non-null  int64  
 11  current_floor        83079 non-null  int64  
 12  total_floors         83079 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 8.2 MB


In [35]:
df.head()

Unnamed: 0,Amount_in_rupees,Price_in_repees,location,Carpet_area_in_sqft,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership,current_floor,total_floors
0,4200000.0,6000.0,10,500.0,3,2,0,0,1,2,1,10,11
1,9800000.0,13799.0,10,473.0,3,1,0,0,2,2,1,3,22
2,14000000.0,17500.0,10,779.0,3,2,0,0,2,2,1,10,29
3,2500000.0,7909.079698,10,530.0,3,2,0,0,1,1,1,1,3
4,16000000.0,18824.0,10,635.0,3,2,7,1,2,2,0,20,42


###Standardization

In [36]:
from sklearn.preprocessing import StandardScaler

numeric_df = df[['Price_in_repees', 'Carpet_area_in_sqft']]

scaler = StandardScaler()
numeric_df_standardized = scaler.fit_transform(numeric_df)

df_standardized = pd.DataFrame(numeric_df_standardized, columns=numeric_df.columns)

df_dropped_numeric = df.drop(columns=['Price_in_repees', 'Carpet_area_in_sqft'])
df_combined = pd.concat([df_dropped_numeric, df_standardized], axis=1)

df_combined.head()

Unnamed: 0,Amount_in_rupees,location,Transaction,Furnishing,facing,overlooking,Bathroom,Balcony,Ownership,current_floor,total_floors,Price_in_repees,Carpet_area_in_sqft
0,4200000.0,10,3,2,0,0,1,2,1,10,11,-0.421117,-0.908393
1,9800000.0,10,3,1,0,0,2,2,1,3,22,1.299236,-0.934927
2,14000000.0,10,3,2,0,0,2,2,1,10,29,2.115627,-0.634208
3,2500000.0,10,3,2,0,0,1,1,1,1,3,0.0,-0.878911
4,16000000.0,10,3,2,7,1,2,2,0,20,42,2.407683,-0.775723


In [37]:
X = df_combined.drop(columns=['Amount_in_rupees'])
y = df_combined['Amount_in_rupees']

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

##Model Training

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [40]:
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(),
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    scoree = model.score(X_test,y_test)

    print(f'{model_name}:')
    print(f'R-squared: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print(f'Accuracy of Model:{scoree:.2f}')
    print('****************************************')

Random Forest:
R-squared: 0.92
Mean Absolute Error (MAE): 615549.58
Root Mean Squared Error (RMSE): 4257168.94
Accuracy of Model:0.92
****************************************
Gradient Boosting:
R-squared: 0.92
Mean Absolute Error (MAE): 1493646.79
Root Mean Squared Error (RMSE): 4507482.34
Accuracy of Model:0.92
****************************************
XGBRegressor:
R-squared: 0.92
Mean Absolute Error (MAE): 740812.10
Root Mean Squared Error (RMSE): 4370434.06
Accuracy of Model:0.92
****************************************
