In [None]:
import pickle

import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the housing listings from the CSV file (path is relative to the
# notebook's working directory).
raw_csv_path = "Bengaluru_House_Data.csv"
house_dataset = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first rows to check which columns were loaded.
house_dataset.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
# Discard the columns that are not used as model features.
# NOTE: this cell rebinds `house_dataset`; re-running it without reloading the
# CSV fails because the columns are already gone.
house_dataset = house_dataset.drop(columns=['area_type', 'availability', 'society'])

In [None]:
# Preview again to confirm that only the remaining columns are present.
house_dataset.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [None]:
# (rows, columns) of the frame at this point.
house_dataset.shape

(13320, 6)

In [None]:
# Count the missing values in each column.
house_dataset.isna().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [None]:
# Remove every row that has a missing value in any column, then re-count the
# nulls per column to confirm that none remain.
house_dataset = house_dataset.dropna(how='any')
house_dataset.isna().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [None]:
# Summary statistics for the columns that are numeric at this point
# ('size' and 'total_sqft' have not been converted to numbers yet).
house_dataset.describe()

Unnamed: 0,bath,balcony,price
count,12710.0,12710.0,12710.0
mean,2.617309,1.584343,106.060778
std,1.226,0.817287,131.766089
min,1.0,0.0,8.0
25%,2.0,1.0,49.03
50%,2.0,2.0,70.0
75%,3.0,2.0,115.0
max,40.0,3.0,2912.0


In [None]:
# 1. Clean 'size' column: keep the first run of digits from entries such as
#    "2 BHK" or "4 Bedroom" and store it as a float. Entries without any digit
#    become NaN.
#    - The pattern is a raw string so that `\d` reaches the regex engine
#      instead of being treated as an (invalid) Python string escape.
#    - expand=False makes `extract` return a Series rather than a one-column
#      DataFrame, which is what a single-column assignment expects.
house_dataset['size'] = (
    house_dataset['size']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

# 2. Clean 'total_sqft' column
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        A plain number (``"1056"``), a two-part range separated by ``-``
        (``"2100 - 2850"``), or an already-numeric value.

    Returns
    -------
    float or None
        * the number itself for plain numeric input,
        * the midpoint of the two bounds for a range,
        * ``None`` when the entry cannot be parsed (for example a value with
          a unit suffix, or ``None``).
    """
    try:
        # Only strings can describe a range; numeric input falls through to
        # float() below, so applying this function to an already-converted
        # column leaves the values unchanged instead of turning them into None.
        if isinstance(x, str):
            tokens = x.split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except (TypeError, ValueError):
        # Unparseable entry. Only conversion errors are caught, so
        # KeyboardInterrupt and other unrelated exceptions still propagate.
        return None

# Parse every raw 'total_sqft' entry; entries that cannot be parsed come back
# as missing values.
house_dataset['total_sqft'] = house_dataset['total_sqft'].apply(convert_sqft_to_num)

# 3. Drop rows with remaining nulls (after conversion).
#    Reassigning (instead of inplace=True) matches the earlier dropna cell.
#    No fillna on 'location' is needed afterwards: dropna has already removed
#    every row with a missing value in any column.
house_dataset = house_dataset.dropna()

# Convert to numeric labels.
# NOTE: this replaces the location names with integer codes in place, so
# re-running this cell would refit the encoder on the codes rather than on the
# original names. Reload the data before re-running.
label_encoder = LabelEncoder()
house_dataset['location'] = label_encoder.fit_transform(house_dataset['location'])


In [None]:
# Persist the fitted location encoder to disk.
with open('location_label_encoder.pkl', mode='wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [None]:
# Separate the target (price) from the features (every other column).
Y = house_dataset['price']
X = house_dataset.drop(columns=['price'])

In [None]:
# Plain-text dump of the feature matrix followed by the target series.
print(X, Y, sep='\n')

       location  size  total_sqft  bath  balcony
0           404   2.0      1056.0   2.0      1.0
1           303   4.0      2600.0   5.0      3.0
2          1136   3.0      1440.0   2.0      3.0
3           735   3.0      1521.0   3.0      1.0
4           694   2.0      1200.0   2.0      1.0
...         ...   ...         ...   ...      ...
13314       459   3.0      1715.0   3.0      3.0
13315      1208   5.0      3453.0   4.0      0.0
13317       938   2.0      1141.0   2.0      1.0
13318       877   4.0      4689.0   4.0      1.0
13319       381   1.0       550.0   1.0      1.0

[12668 rows x 5 columns]
0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13314    112.00
13315    231.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 12668, dtype: float64


In [None]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,location,size,total_sqft,bath,balcony
0,404,2.0,1056.0,2.0,1.0
1,303,4.0,2600.0,5.0,3.0
2,1136,3.0,1440.0,2.0,3.0
3,735,3.0,1521.0,3.0,1.0
4,694,2.0,1200.0,2.0,1.0


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=45,
)

In [None]:
# Shapes of the full, training and test feature matrices, on one line.
print(*(frame.shape for frame in (X, X_train, X_test)))

(12668, 5) (10134, 5) (2534, 5)


In [None]:
# Model: random-forest regressor with 250 trees, each limited to depth 25.
# random_state is fixed so that fitting on the same data is repeatable.
# (RandomForestRegressor is imported in the notebook's import cell.)
model = RandomForestRegressor(n_estimators=250, max_depth=25, random_state=4)

In [None]:
# Fit the forest on the training split only; the test split is not used here.
model.fit(X_train, Y_train)

0,1,2
,n_estimators,250
,criterion,'squared_error'
,max_depth,25
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Predictions for the training rows themselves (used to measure in-sample fit).
train_predict=model.predict(X_train)


In [None]:
# Show the in-sample predictions (the printed array is abbreviated).
print(train_predict)

[124.34530182 244.252       29.46520413 ...  54.86192394  58.91346
  57.37299303]


In [None]:
# Training-set fit quality.
# R squared is a goodness-of-fit score (higher is better), not an error, so it
# is labelled as a score.
score1 = metrics.r2_score(Y_train, train_predict)
# Mean absolute error between the actual and the predicted prices.
score2 = metrics.mean_absolute_error(Y_train, train_predict)
print("R squared score:", score1)
print("Mean Absolute error:", score2)

R square error: 0.9356616896790813
Mean Absolute error: 12.485507655112288


In [None]:
# Predictions for the held-out test split.
test_predict=model.predict(X_test)


In [None]:
# Test-set fit quality (rows the model was not fitted on).
# NOTE: score1 / score2 are reused here and overwrite the training-set values.
# R squared is a goodness-of-fit score (higher is better), not an error, so it
# is labelled as a score.
score1 = metrics.r2_score(Y_test, test_predict)
# Mean absolute error between the actual and the predicted prices.
score2 = metrics.mean_absolute_error(Y_test, test_predict)
print("R squared score:", score1)
print("Mean Absolute error:", score2)

R square error: 0.6272593458345459
Mean Absolute error: 30.32696717532973


In [None]:
# Load the encoder back from the pickle file, rebinding `label_encoder`.
# pickle.load can execute arbitrary code contained in the file, so only load
# pickle files from a trusted source.
with open('location_label_encoder.pkl', mode='rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)



In [None]:
# Persist the trained model, then read it back from the same file.
# Each handle is opened in a `with` block so it is closed deterministically:
# the dump is fully written before the file is reopened for reading.
with open('mlmodel.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE(review): `mlmode` looks like a typo for `mlmodel`; the name is kept
# unchanged in case other code refers to it.
with open('mlmodel.pkl', 'rb') as model_file:
    mlmode = pickle.load(model_file)