In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
import datetime
from scipy import stats
from scipy.stats import zscore
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder
from folium.plugins import MarkerCluster
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [7]:
pd.options.display.max_columns = None
df = pd.read_csv('zameen-updated.csv')

In [8]:
missing_values = df.isnull().sum()
print(missing_values)

property_id          0
location_id          0
page_url             0
property_type        0
price                0
location             0
city                 0
province_name        0
latitude             0
longitude            0
baths                0
area                 0
purpose              0
bedrooms             0
date_added           0
agency           44071
agent            44072
Area Type            0
Area Size            0
Area Category        0
dtype: int64


In [12]:
df['agency'].fillna('NA', inplace=True)
df['agent'].fillna('NA', inplace=True)

In [14]:
df.isnull().sum() 

property_id      0
location_id      0
page_url         0
property_type    0
price            0
location         0
city             0
province_name    0
latitude         0
longitude        0
baths            0
area             0
purpose          0
bedrooms         0
date_added       0
agency           0
agent            0
Area Type        0
Area Size        0
Area Category    0
dtype: int64

In [24]:
categories = ['page_url', 'property_type', 'location', 'city', 'province_name', 'purpose']
for c in categories:
 df[c] = df[c].str.strip().str.lower()

In [26]:
df = df[df['price'] > 0]

In [28]:
df = df[(df['latitude'].between(-90, 90)) & (df['longitude'].between(-180, 180))]

In [30]:
df = df[df['baths'] >= 0]
df = df[df['bedrooms'] >= 0]
duplicates = df.duplicated().sum()
print(duplicates)

0


In [32]:
nc = ['price', 'latitude', 'longitude', 'baths', 'bedrooms', 'Area Size']
count = {}
for c in nc:
    q1 = df[c].quantile(0.25)
    q3 = df[c].quantile(0.75)
    iqr = q3 - q1
    lb = q1 - 1.5 * iqr
    ub = q3 + 1.5 * iqr
    outlier = (df[c] < lb) | (df[c] > ub)
    count[c] = outlier.sum()
print(count)

{'price': 13547, 'latitude': 9, 'longitude': 6, 'baths': 22, 'bedrooms': 3319, 'Area Size': 3326}


In [34]:
df['price'] = winsorize(df['price'], limits=[0.01, 0.01])
df['latitude'] = winsorize(df['latitude'], limits=[0.05, 0.05])
df['longitude'] = winsorize(df['longitude'], limits=[0.05, 0.05])
df['baths'] = winsorize(df['baths'], limits=[0.05, 0.05])
df['bedrooms'] = winsorize(df['bedrooms'], limits=[0.05, 0.05])
df['Area Size'] = winsorize(df['Area Size'], limits=[0.05, 0.05])

In [38]:
price_q1 = df['price'].quantile(0.25)
price_q3 = df['price'].quantile(0.75)
price_iqr = price_q3 - price_q1
price_lb = price_q1 - 1.5 * price_iqr
price_ub = price_q3 + 1.5 * price_iqr
price_mean = df['price'].mean()
df.loc[df['price'] > price_ub, 'price'] = price_mean
df.loc[df['price'] < price_lb, 'price'] = price_mean

  arr.partition(
  arr.partition(
  df.loc[df['price'] > price_ub, 'price'] = price_mean


In [40]:
outlierUpdated = (df[nc] < price_lb) | (df[nc] > price_ub)
updatedCountOutlier = outlierUpdated.sum()
print(updatedCountOutlier)

price        0
latitude     0
longitude    0
baths        0
bedrooms     0
Area Size    0
dtype: int64


In [42]:
df['agency'] = df['agency'].str.strip().str.title()
df['agency'].fillna('NA', inplace=True)

In [44]:
lat_mean = df['latitude'].mean()
lon_mean = df['longitude'].mean()
m = folium.Map(location=[lat_mean, lon_mean], zoom_start=6)
marker_cluster = MarkerCluster().add_to(m)
for _, row in df.iterrows():
 folium.Marker(
 location=[row['latitude'], row['longitude']],
 popup=f"Price: {row['price']}, Beds: {row['bedrooms']}, Baths: {row['baths']}"
 ).add_to(marker_cluster)
map_filename = 'house_price_map.html'
m.save(map_filename)

In [45]:
from datetime import datetime
currYear = datetime.now().year
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['house_age'] = currYear - df['date_added'].dt.year // 365
df['house_age'].fillna(df['house_age'].median(), inplace=True)

In [46]:
def convert_area(area):
 if isinstance(area, str):
     area = area.replace(',', '')
 if 'Marla' in area:
     value = float(area.split()[0])
     return value * 272.25
 elif 'Kanal' in area:
     value = float(area.split()[0])
     return value * 20 * 272.25
 elif 'Square Feet' in area:
     value = float(area.split()[0])
     return value
 return area
df['area'] = df['area'].apply(convert_area)

In [47]:
df['area'] = df['area'].astype(str)
df['area'] = df['area'].str.replace(' Marla', '', regex=False).str.replace(',', '').astype(float)
print(df[['area']].head())

      area
0   1089.0
1   1524.6
2   2178.0
3  10890.0
4   2178.0


In [80]:
df['priceToSqft'] = df['price'] / df['area']

In [48]:
df['bedroomToFloor'] = df['bedrooms'] / df['area']
df['bedroomToFloor'].replace([np.inf, -np.inf], np.nan, inplace=True)
df['bedroomToFloor'].fillna(df['bedroomToFloor'].median(), inplace=True)

In [49]:
df['bathToBedroom'] = df['baths'] / df['bedrooms']
df['bathToBedroom'].replace([np.inf, -np.inf], np.nan, inplace=True)
df['bathToBedroom'].fillna(df['bathToBedroom'].median(), inplace=True)

In [50]:
print(df.columns)

Index(['property_id', 'location_id', 'page_url', 'property_type', 'price',
       'location', 'city', 'province_name', 'latitude', 'longitude', 'baths',
       'area', 'purpose', 'bedrooms', 'date_added', 'agency', 'agent',
       'Area Type', 'Area Size', 'Area Category', 'house_age',
       'bedroomToFloor', 'bathToBedroom'],
      dtype='object')


In [51]:
categorical_columns = ['property_type', 'location', 'city', 'province_name', 'purpose', 'agency', 'agent', 'Area Type', 'Area Category']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [59]:
df['price_zscore'] = zscore(df['price'])
outlierHigh = df[df['price_zscore'] > 3]
outlierLow = df[df['price_zscore'] < -3]

In [61]:
features = ['area', 'bedrooms', 'baths', 'house_age', 'latitude', 'longitude']
target = 'price'
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
lrModel = LinearRegression()
lrModel.fit(X_train, y_train)

In [76]:
yPredLR = lrModel.predict(X_test)
maeLR = mean_absolute_error(y_test, yPredLR)
mseLR = mean_squared_error(y_test, yPredLR)
r2LR = r2_score(y_test, yPredLR)
print("MAE:", maeLR)
print("MSE:", mseLR)
print("R2:", r2LR)

MAE: 7124822.044877812
MSE: 92482556338327.88
R2: 0.24622447667397995


In [72]:
rfModel = RandomForestRegressor(n_estimators=100, random_state=42)
rfModel.fit(X_train, y_train)

In [78]:
yPredRF = rfModel.predict(X_test)
maeRF = mean_absolute_error(y_test, yPredRF)
mseRF = mean_squared_error(y_test, yPredRF)
r2RF = r2_score(y_test, yPredRF)
print("MAE:", maeRF)
print("MSE:", mseRF)
print("R2:", r2RF)

MAE: 4124486.804822419
MSE: 49980837246734.59
R2: 0.5926331057058325


In [80]:
gbModel = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbModel.fit(X_train, y_train)

In [86]:
yPredGB = gbModel.predict(X_test)
maeGB = mean_absolute_error(y_test, yPredGB)
mseGB = mean_squared_error(y_test, yPredGB)
r2GB = r2_score(y_test, yPredGB)
print("MAE:", maeGB)
print("MSE:", mseGB)
print("R2:", r2GB)

MAE: 5580709.0079267025
MSE: 62483769795104.34
R2: 0.4907284341883179
