In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Bengaluru_House_Data.csv")  # replace with your actual file name


In [4]:
print(df.columns)


Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')


In [5]:
# Drop society only if it exists
if 'society' in df.columns:
    df.drop('society', axis=1, inplace=True)


In [6]:
# Fill missing values with most frequent values (mode) if available

if 'location' in df.columns and not df['location'].mode().empty:
    df['location'].fillna(df['location'].mode()[0], inplace=True)

if 'size' in df.columns and not df['size'].mode().empty:
    df['size'].fillna(df['size'].mode()[0], inplace=True)

if 'availability' in df.columns and not df['availability'].mode().empty:
    df['availability'].fillna(df['availability'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna(df['location'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['size'].fillna(df['size'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [7]:
# Numerical values — use median (robust to outliers)
if 'bath' in df.columns:
    df['bath'].fillna(df['bath'].median(), inplace=True)

if 'balcony' in df.columns:
    df['balcony'].fillna(df['balcony'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bath'].fillna(df['bath'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['balcony'].fillna(df['balcony'].median(), inplace=True)


In [8]:
# Extract numeric value from 'size' column (e.g., "2 BHK" → 2)
df['size'] = df['size'].str.extract('(\d+)').astype(float)


In [10]:
# Convert 'total_sqft' values to float
def convert_sqft(value):
    try:
        # If range, return average
        if '-' in str(value):
            tokens = value.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        # If it can be directly converted to float
        return float(value)
    except:
        return None  # Skip complex formats like '34.5Sq. Meter'

df['total_sqft'] = df['total_sqft'].apply(convert_sqft)


In [11]:
# Drop rows where total_sqft is still None after conversion
df.dropna(subset=['total_sqft'], inplace=True)


💬 Why Label Encoding?
Each unique category gets a number (e.g., Whitefield → 157)

Unlike one-hot, it doesn’t blow up the number of features (especially helpful for location, which has 200+ values).

Random Forest is tree-based, so label encoding won't hurt the performance like it would in linear models.

In [12]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders for each column
le_area = LabelEncoder()
le_avail = LabelEncoder()
le_loc = LabelEncoder()

# Apply encoding
df['area_type'] = le_area.fit_transform(df['area_type'])
df['availability'] = le_avail.fit_transform(df['availability'])
df['location'] = le_loc.fit_transform(df['location'])


In [14]:
import numpy as np

# outlier removing

# Calculate Q1 and Q3
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# Identify outliers
df_no_outliers = df[(df['price'] >= (Q1 - 1.5 * IQR)) & (df['price'] <= (Q3 + 1.5 * IQR))]

df.reset_index(drop=True, inplace=True)

print(df_no_outliers)

       area_type  availability  location  size  total_sqft  bath  balcony  \
0              3            40       416   2.0      1056.0   2.0      1.0   
1              2            80       314   4.0      2600.0   5.0      3.0   
2              0            80      1174   3.0      1440.0   2.0      3.0   
3              3            80       754   3.0      1521.0   3.0      1.0   
4              3            80       713   2.0      1200.0   2.0      1.0   
...          ...           ...       ...   ...         ...   ...      ...   
13266          3            80       234   2.0      1262.0   2.0      2.0   
13267          3            80      1174   3.0      1345.0   2.0      1.0   
13268          3            80       471   3.0      1715.0   3.0      3.0   
13271          0            80       967   2.0      1141.0   2.0      1.0   
13273          3            80       393   1.0       550.0   1.0      1.0   

        price  
0       39.07  
1      120.00  
2       62.00  
3       95.

In [15]:
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_limit) & (df[column] <= upper_limit)]


In [16]:
# List of all numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Apply outlier removal for each numerical column
for col in num_cols:
    df = remove_outliers_iqr(df, col)


In [17]:
# Remove top and bottom 1% of price to reduce noise
q1 = df['price'].quantile(0.01)
q99 = df['price'].quantile(0.99)
df = df[(df['price'] > q1) & (df['price'] < q99)]


In [18]:
# Remove rows where sqft per BHK is suspiciously low or high
df = df[df['size'] > 0]  # avoid division by zero
df['sqft_per_bhk'] = df['total_sqft'] / df['size']
df = df[(df['sqft_per_bhk'] > 300) & (df['sqft_per_bhk'] < 2000)]


In [19]:
# Group rare locations into 'Other'
location_counts = df['location'].value_counts()
rare_locations = location_counts[location_counts <= 10].index
df['location'] = df['location'].apply(lambda x: 'Other' if x in rare_locations else x)


In [20]:
# Convert all location values to string
df['location'] = df['location'].astype(str)

# Now safely encode
from sklearn.preprocessing import LabelEncoder
le_loc = LabelEncoder()
df['location'] = le_loc.fit_transform(df['location'])


In [21]:
# Only select numerical columns
num_cols = ['total_sqft', 'size', 'bath', 'balcony', 'price']

# Show skewness
df[num_cols].skew()


total_sqft    0.538027
size          0.078530
bath          0.748790
balcony      -0.064654
price         0.965380
dtype: float64

In [22]:
import numpy as np

# Apply log1p to reduce skewness
df['total_sqft'] = np.log1p(df['total_sqft'])
df['price'] = np.log1p(df['price'])   # Target variable — helps improve RMSE


In [23]:
df.drop('availability', axis=1, inplace=True)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Features and target
features = ['area_type', 'location', 'size', 'total_sqft', 'bath', 'balcony']
target = 'price'
X = df[features]
y = df[target]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Metrics
print("✅ R2 Score:", r2_score(y_test, y_pred))
print("✅ RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


✅ R2 Score: 0.6393270018706514
✅ RMSE: 0.23844214790875806


🚨 Important Note:
If you apply log1p to the target (price), you must reverse the log when you deploy:

In [25]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(8, 6))
# sns.scatterplot(x=y_true, y=y_pred, alpha=0.4)
# plt.xlabel("Actual Price")
# plt.ylabel("Predicted Price")
# plt.title("Actual vs Predicted Prices")
# plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')  # perfect line
# plt.grid(True)
# plt.show()


In [26]:
# Reverse log1p if needed
y_pred = np.expm1(rf.predict(X_test))
y_actual = np.expm1(y_test)

# Print first 10 actual vs predicted values
for i in range(10):
    print(f"🏠 Property {i+1}:")
    print(f"   ✅ Actual Price    : {round(y_actual.iloc[i], 2)} Lakhs")
    print(f"   🤖 Predicted Price : {round(y_pred[i], 2)} Lakhs")
    print("-" * 40)


🏠 Property 1:
   ✅ Actual Price    : 90.0 Lakhs
   🤖 Predicted Price : 106.58 Lakhs
----------------------------------------
🏠 Property 2:
   ✅ Actual Price    : 67.0 Lakhs
   🤖 Predicted Price : 63.3 Lakhs
----------------------------------------
🏠 Property 3:
   ✅ Actual Price    : 110.0 Lakhs
   🤖 Predicted Price : 103.68 Lakhs
----------------------------------------
🏠 Property 4:
   ✅ Actual Price    : 52.0 Lakhs
   🤖 Predicted Price : 54.9 Lakhs
----------------------------------------
🏠 Property 5:
   ✅ Actual Price    : 85.0 Lakhs
   🤖 Predicted Price : 92.09 Lakhs
----------------------------------------
🏠 Property 6:
   ✅ Actual Price    : 36.75 Lakhs
   🤖 Predicted Price : 40.82 Lakhs
----------------------------------------
🏠 Property 7:
   ✅ Actual Price    : 46.0 Lakhs
   🤖 Predicted Price : 46.58 Lakhs
----------------------------------------
🏠 Property 8:
   ✅ Actual Price    : 72.0 Lakhs
   🤖 Predicted Price : 83.19 Lakhs
----------------------------------------
🏠 Prop

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6178 entries, 3 to 13268
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     6178 non-null   int64  
 1   location      6178 non-null   int64  
 2   size          6178 non-null   float64
 3   total_sqft    6178 non-null   float64
 4   bath          6178 non-null   float64
 5   balcony       6178 non-null   float64
 6   price         6178 non-null   float64
 7   sqft_per_bhk  6178 non-null   float64
dtypes: float64(6), int64(2)
memory usage: 434.4 KB


In [28]:
# Define input features (X) and target variable (y)
features = ['area_type', 'location', 'size', 'total_sqft', 'bath', 'balcony']
target = 'price'

X = df[features]
y = df[target]


In [29]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train Random Forest
rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
rf.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [31]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Predict on test data
y_pred = rf.predict(X_test)

# Print accuracy metrics
print("✅ R2 Score:", r2_score(y_test, y_pred))
print("✅ RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


✅ R2 Score: 0.6393270018706514
✅ RMSE: 0.23844214790875806


In [32]:
# Before training
X = df.drop("price", axis=1)
y = df["price"]

# Save the column order used
import pickle
columns_used = X.columns.tolist()
with open("columns.pkl", "wb") as f:
    pickle.dump(columns_used, f)

# Train the model
model = RandomForestRegressor()
model.fit(X, y)

# Save model
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

