In [1]:
%matplotlib qt

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lls import LLS
import re

In [3]:
data = pd.read_csv("housePrice.csv")
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [4]:
data["Parking"].replace(["True", "False"], [1, 0], inplace=True)
data["Warehouse"].replace(["True", "False"], [1, 0], inplace=True)
data["Elevator"].replace(["True", "False"], [1, 0], inplace=True)
data

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1.850000e+09,61666.67
1,60,1,True,True,True,Shahran,1.850000e+09,61666.67
2,79,2,True,True,True,Pardis,5.500000e+08,18333.33
3,95,2,True,True,True,Shahrake Qods,9.025000e+08,30083.33
4,123,2,True,True,True,Shahrake Gharb,7.000000e+09,233333.33
...,...,...,...,...,...,...,...,...
3468,86,2,True,True,True,Southern Janatabad,3.500000e+09,116666.67
3469,83,2,True,True,True,Niavaran,6.800000e+09,226666.67
3470,75,2,False,False,False,Parand,3.650000e+08,12166.67
3471,105,2,True,True,True,Dorous,5.600000e+09,186666.67


In [5]:
data['Area'] = data['Area'].astype(str).apply(lambda x: re.sub(',', '', x))
data["Area"] = pd.to_numeric(data["Area"] , errors='coerce')

In [6]:
data.dropna(inplace=True)

In [7]:
exchange_rate_july_2023 = 50000 
data['Price(USD)'] = data['Price'] / exchange_rate_july_2023
data

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1.850000e+09,37000.0
1,60,1,True,True,True,Shahran,1.850000e+09,37000.0
2,79,2,True,True,True,Pardis,5.500000e+08,11000.0
3,95,2,True,True,True,Shahrake Qods,9.025000e+08,18050.0
4,123,2,True,True,True,Shahrake Gharb,7.000000e+09,140000.0
...,...,...,...,...,...,...,...,...
3468,86,2,True,True,True,Southern Janatabad,3.500000e+09,70000.0
3469,83,2,True,True,True,Niavaran,6.800000e+09,136000.0
3470,75,2,False,False,False,Parand,3.650000e+08,7300.0
3471,105,2,True,True,True,Dorous,5.600000e+09,112000.0


In [8]:
most_expensive_houses = data.sort_values(by='Price', ascending=False).head(5).reset_index()
print("Addresses of the 5 most expensive houses:")
print(most_expensive_houses[['Address', 'Price']])


Addresses of the 5 most expensive houses:
      Address         Price
0  Zaferanieh  9.240000e+10
1      Abazar  9.100000e+10
2     Lavasan  8.500000e+10
3  Ekhtiarieh  8.160000e+10
4    Niavaran  8.050000e+10


In [9]:
def lower_upper(x):
    Q1 = np.percentile(x, 25)
    Q3 = np.percentile(x, 75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    return lower, upper

lower_area, upper_area = lower_upper(data['Area'])
lower_price, upper_price = lower_upper(data['Price'])

In [10]:
area_outliers = np.where(data['Area'] > upper_area)
price_outliers = np.where(data['Price'] > upper_price)

total_outliers = np.union1d(area_outliers, price_outliers)

In [11]:
data = data.drop(total_outliers)

In [12]:
address_dummy = pd.get_dummies(data['Address'])
data = data.merge(address_dummy, left_index = True, right_index = True)
data.drop(columns = ["Address", "Price(USD)"], inplace = True)
data.head(3)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Abazar,Abbasabad,Absard,Abuzar,...,Waterfall,West Ferdows Boulevard,West Pars,Yaftabad,Yakhchiabad,Yousef Abad,Zafar,Zaferanieh,Zargandeh,Zibadasht
0,63,1,True,True,True,1850000000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60,1,True,True,True,1850000000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,79,2,True,True,True,550000000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from testtrain_split import train_test_split


X = data.drop(columns = 'Price')
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print(f"shape of x train: {X_train.shape}")
print(f"shape of y train: {y_train.shape}")
print(f"shape of x test: {X_test.shape}")
print(f"shape of y train: {y_test.shape}")

shape of x train: (2471, 195)
shape of y train: (2471,)
shape of x test: (618, 195)
shape of y train: (618,)


In [14]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)


In [15]:
lls = LLS()
weights = lls.fit(X_train, y_train)
print("LLS weights:", weights)

LLS weights: [-2.47077394e+09  6.01353217e+07  7.00488648e+08 -5.70005650e+08
 -1.77479002e+07 -2.25387903e+08  3.36232045e+09  9.53033292e+08
 -2.47321712e+10 -3.68732896e+08 -1.49714284e+09 -1.23523032e+08
 -1.02550805e+09  1.04954127e+10 -8.45621749e+08  1.25756467e-01
  2.10736195e+09 -3.99669569e+09 -9.55199300e+08 -2.67512837e+09
  1.04417780e+10  2.42871955e+09 -8.35349591e+07 -2.24508447e+09
 -1.23836817e+09 -4.19136694e+08 -7.46669746e+08 -4.30026752e+09
  1.35415475e-01 -2.80301055e+09 -8.64037621e+08 -1.44815828e+09
  4.10775773e+08 -2.66035249e+09 -6.50736581e+09  1.65786647e-01
  1.87019018e+09 -1.06622871e+10  2.13596213e+09  4.53194933e+08
  1.53292429e+09 -1.46055064e+09 -1.56468321e+09  4.61898344e+09
 -6.95807442e+08  9.55768455e+09  2.70734226e+08 -5.06253193e+08
  1.34361091e+09  1.71780852e+09  1.42175995e+10 -4.96426111e+08
  2.43580808e-01  1.35880384e+08 -4.84553224e+08  1.01864328e+10
 -1.01854115e+09 -3.50337772e+07 -2.50286958e+09 -2.76408748e+09
  3.94884234

In [16]:
y_pred_test = lls.predict(X_test)
#y_pred_test = y_pred_test.flatten()
y_pred_test

array([3341496252.750988, 13765129838.373676, 10088923569.71242,
       3582294966.1501265, 3101212392.778447, 1972688339.9783163,
       -109568991.0498352, 6500623438.91834, 14584369976.981445,
       2608844758.2985916, 55811201.43618393, 994750175.7846723,
       1274465679.2670884, 5239097726.055326, 2280939418.341224,
       19028702441.597824, 837570383.165163, 6353762852.348666,
       3931952730.0123024, 16060171245.758455, 4526370705.194829,
       3594158787.6260185, 2706848348.6337996, 1387330002.727189,
       7333865065.270285, 1198382313.193923, 7400131276.899637,
       6967975269.300137, 3247014272.9236307, 895913168.8570023,
       21286392846.478096, 23415305363.41269, 595236560.4997025,
       -1482723942.8890848, -291639881.21192646, 15622124318.055767,
       5707862582.360631, 1040137487.6948853, 18925406015.87857,
       5536320151.653528, 2449999998.794055, 915657566.4305878,
       956048490.5284624, 140410634.00698638, 7000872885.099165,
       3592276011.714

In [17]:

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

fig.patch.set_facecolor('#2b2b2b')  
ax.set_facecolor('#2b2b2b')        

ax.scatter(X_test[:, 0], X_test[:, 1], y_test, c='green', marker='o', label='Actual Test Data', alpha=0.7)

x1_range = np.linspace(X_test[:, 0].min(), X_test[:, 0].max(), 10)  
x2_range = np.linspace(X_test[:, 1].min(), X_test[:, 1].max(), 10) 
x1_grid, x2_grid = np.meshgrid(x1_range, x2_range)

y_pred_plane = weights[0] + weights[1] * x1_grid + weights[2] * x2_grid

ax.plot_surface(x1_grid, x2_grid, y_pred_plane, color='yellow', alpha=0.5)

ax.set_title('Real Estate Price Prediction', fontsize=14, color='#e0e0e0')
ax.set_xlabel('Area (sqm)', fontsize=12, color='#e0e0e0')
ax.set_ylabel('Number of Rooms', fontsize=12, color='#e0e0e0')
ax.set_zlabel('Price', fontsize=12, color='#e0e0e0')

ax.tick_params(axis='x', colors='#e0e0e0')
ax.tick_params(axis='y', colors='#e0e0e0')
ax.tick_params(axis='z', colors='#e0e0e0')
ax.grid(color='#555555', linestyle='--', linewidth=0.5, alpha=0.6)

ax.legend(loc='upper left', fontsize=10, facecolor='#2b2b2b', edgecolor='#555555', framealpha=0.8)

plt.tight_layout()
plt.show()


In [18]:
mae, mse, rmse, r2 = lls.evaluate(y_test, y_pred_test)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²) Score:", r2)

Mean Absolute Error (MAE): 2015576756.791834
Mean Squared Error (MSE): 2.0161375915022418e+19
Root Mean Squared Error (RMSE): 4490142081.830197
R-squared (R²) Score: 0.5648710939385799


In [19]:
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

mae_linear = mean_absolute_error(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print("Linear Regression - MAE:", mae_linear)
print("Linear Regression - MSE:", mse_linear)
print("Linear Regression - RMSE:", rmse_linear)
print("Linear Regression - R2 Score:", r2_linear)

ridge_model = RidgeCV() 
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression - MAE:", mae_ridge)
print("Ridge Regression - MSE:", mse_ridge)
print("Ridge Regression - RMSE:", rmse_ridge)
print("Ridge Regression - R2 Score:", r2_ridge)


Linear Regression - MAE: 2015418643.108126
Linear Regression - MSE: 2.016063719791692e+19
Linear Regression - RMSE: 4490059821.1958065
Linear Regression - R2 Score: 0.6969272292761572
Ridge Regression - MAE: 1999349713.0853157
Ridge Regression - MSE: 1.8548444448092664e+19
Ridge Regression - RMSE: 4306790504.319041
Ridge Regression - R2 Score: 0.72116315589065
