# Problem statement
#### Given the features in the dataset, predict the price of the House

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
import pandas as pd 

data = pd.read_csv('real_estate_dataset.csv')

In [3]:
data

Unnamed: 0,ID,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center,Price
0,1,143.635030,1,3,3,1967,1,1,48,8.297631,5.935734,602134.816747
1,2,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392,591425.135386
2,3,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599,464478.696880
3,4,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019,583105.655996
4,5,89.004660,4,3,3,1999,1,0,34,1.523278,14.648277,619879.142523
...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,138.338057,2,2,2,1967,1,0,16,4.296086,5.562583,488496.350722
496,497,195.914028,2,3,1,1977,0,1,45,7.406261,2.845105,657736.921717
497,498,69.433659,1,1,2,2004,0,0,18,8.629724,6.263264,405324.950201
498,499,293.598702,5,1,3,1940,1,0,41,5.318891,16.990684,773035.968028


## Exploratory Data Analysis (EDA) Data Exploration and Cleaning

In [4]:
data.shape

(500, 12)

In [5]:
data.isnull().sum()

ID                    0
Square_Feet           0
Num_Bedrooms          0
Num_Bathrooms         0
Num_Floors            0
Year_Built            0
Has_Garden            0
Has_Pool              0
Garage_Size           0
Location_Score        0
Distance_to_Center    0
Price                 0
dtype: int64

In [6]:
data.duplicated().sum()

0

In [7]:
data.columns

Index(['ID', 'Square_Feet', 'Num_Bedrooms', 'Num_Bathrooms', 'Num_Floors',
       'Year_Built', 'Has_Garden', 'Has_Pool', 'Garage_Size', 'Location_Score',
       'Distance_to_Center', 'Price'],
      dtype='object')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  500 non-null    int64  
 1   Square_Feet         500 non-null    float64
 2   Num_Bedrooms        500 non-null    int64  
 3   Num_Bathrooms       500 non-null    int64  
 4   Num_Floors          500 non-null    int64  
 5   Year_Built          500 non-null    int64  
 6   Has_Garden          500 non-null    int64  
 7   Has_Pool            500 non-null    int64  
 8   Garage_Size         500 non-null    int64  
 9   Location_Score      500 non-null    float64
 10  Distance_to_Center  500 non-null    float64
 11  Price               500 non-null    float64
dtypes: float64(4), int64(8)
memory usage: 47.0 KB


In [9]:
data = data.drop(columns=['ID'])   # drop unnecessary ID

In [10]:
data.head(5)

Unnamed: 0,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center,Price
0,143.63503,1,3,3,1967,1,1,48,8.297631,5.935734,602134.816747
1,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392,591425.135386
2,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599,464478.69688
3,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019,583105.655996
4,89.00466,4,3,3,1999,1,0,34,1.523278,14.648277,619879.142523


In [11]:
# Detect target column
target_col = 'Price'
X = data.drop(columns=[target_col])
y = data[target_col]

In [12]:
# Convert Categorical Columns Automatically
X = pd.get_dummies(X, drop_first=True)

In [13]:
data

Unnamed: 0,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center,Price
0,143.635030,1,3,3,1967,1,1,48,8.297631,5.935734,602134.816747
1,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392,591425.135386
2,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599,464478.696880
3,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019,583105.655996
4,89.004660,4,3,3,1999,1,0,34,1.523278,14.648277,619879.142523
...,...,...,...,...,...,...,...,...,...,...,...
495,138.338057,2,2,2,1967,1,0,16,4.296086,5.562583,488496.350722
496,195.914028,2,3,1,1977,0,1,45,7.406261,2.845105,657736.921717
497,69.433659,1,1,2,2004,0,0,18,8.629724,6.263264,405324.950201
498,293.598702,5,1,3,1940,1,0,41,5.318891,16.990684,773035.968028


# Feature Engineering

In [14]:
# Select features (X) and target (y)
X = data[['Square_Feet', 
          'Num_Bedrooms', 
          'Num_Bathrooms', 
          'Num_Floors',
          'Year_Built',
          'Has_Garden',
          'Has_Pool',
          'Garage_Size',
          'Location_Score',
          'Distance_to_Center']]

y = data['Price']

# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.3,   # 30% test
    random_state=30  # ensures same result every time
)


In [15]:
X

Unnamed: 0,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center
0,143.635030,1,3,3,1967,1,1,48,8.297631,5.935734
1,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392
2,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599
3,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019
4,89.004660,4,3,3,1999,1,0,34,1.523278,14.648277
...,...,...,...,...,...,...,...,...,...,...
495,138.338057,2,2,2,1967,1,0,16,4.296086,5.562583
496,195.914028,2,3,1,1977,0,1,45,7.406261,2.845105
497,69.433659,1,1,2,2004,0,0,18,8.629724,6.263264
498,293.598702,5,1,3,1940,1,0,41,5.318891,16.990684


In [16]:
y

0      602134.816747
1      591425.135386
2      464478.696880
3      583105.655996
4      619879.142523
           ...      
495    488496.350722
496    657736.921717
497    405324.950201
498    773035.968028
499    864299.500218
Name: Price, Length: 500, dtype: float64

# linear Regression

In [17]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

In [18]:
predictions

array([458712.90497931, 535117.1289958 , 662050.05345059, 580640.54702385,
       449951.68499679, 397575.94522421, 643177.1568423 , 543372.7546773 ,
       625396.72933605, 410306.03477592, 682288.62372764, 470122.94065182,
       526120.81418824, 599802.92685236, 582827.40800781, 389596.15706404,
       666363.91370313, 501698.368157  , 531307.98009228, 294402.97141564,
       669446.86622306, 632970.11109082, 625589.52258968, 792738.90311358,
       701507.49256973, 708153.81309503, 707370.83598295, 583520.24271883,
       669679.36137778, 622318.19127648, 602042.66639329, 537641.37589328,
       545448.52480872, 629406.68339543, 664553.14537285, 620341.67063691,
       627326.86048566, 820315.32166225, 564787.15286687, 432037.59035254,
       622686.33560005, 567143.17259727, 706928.75175323, 726707.3356765 ,
       336642.91210253, 356911.36942087, 479528.4871112 , 505121.59310484,
       502553.61384399, 568441.07569526, 399312.29596292, 731607.26351686,
       732102.88017691, 4

In [19]:
original = y_test

original

21     479876.364639
239    550978.377631
103    648175.924385
273    546266.789965
22     437751.643015
           ...      
32     592796.962725
238    413971.776219
188    488287.235566
463    522373.162340
352    577045.215572
Name: Price, Length: 150, dtype: float64

In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [21]:
mae =  mean_absolute_error(original,predictions)
mse = mean_squared_error(original,predictions)
r2 = r2_score(original,predictions)*100

In [22]:
mae

15091.329857432063

In [23]:
# ensure the answer is as close as possible to 1. 
r2

97.41342819741523

In [25]:
import matplotlib.pyplot as plt

plt.scatter(y_test, lin_preds)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Predicted vs Actual - Linear Regression')
plt.show()

NameError: name 'lin_preds' is not defined

# Regression Algorithms

#### Decision Tree Regressor

In [26]:
from sklearn.tree import DecisionTreeRegressor

DTR = DecisionTreeRegressor()

In [27]:
DTR.fit(X_train, y_train)

In [28]:
prediction = DTR.predict(X_test)

In [29]:
prediction

array([493819.27248399, 586944.41665527, 639169.40866231, 568484.72226827,
       359989.95703789, 442615.71617951, 773035.96802766, 506721.273236  ,
       492310.60888309, 439102.99453753, 657042.15036522, 524004.12061234,
       506721.273236  , 516021.3794263 , 570260.36605797, 511009.69673147,
       635900.8402217 , 540406.72483988, 532329.5565179 , 298871.66526683,
       547258.00508106, 695642.10654902, 739265.73652637, 717354.13961934,
       715176.7624918 , 644419.72721052, 714280.62794187, 586689.04150368,
       568484.72226827, 608932.19592768, 594082.86641082, 511009.69673147,
       542276.208466  , 579720.4685698 , 673010.58948903, 557099.51024737,
       606293.98365011, 860013.25820393, 557752.83735899, 487850.94841684,
       611030.30798725, 557099.51024737, 742958.51213822, 731778.40487664,
       396497.94799452, 505392.96801439, 496952.7583596 , 507320.71494103,
       540406.72483988, 540518.37414335, 532485.19975047, 827018.65537275,
       695642.10654902, 5

In [30]:
original = y_test

original

21     479876.364639
239    550978.377631
103    648175.924385
273    546266.789965
22     437751.643015
           ...      
32     592796.962725
238    413971.776219
188    488287.235566
463    522373.162340
352    577045.215572
Name: Price, Length: 150, dtype: float64

## Evaluation Metrics

In [31]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [32]:
mae =  mean_absolute_error(original,prediction)
mse = mean_squared_error(original,prediction)
r2 = r2_score(original,prediction)*100

In [33]:
mae

57128.63211358962

In [34]:
mse

5523496729.666491

In [35]:
# ensure the answer is as close as possible to 1. 
r2

61.17470499502482

# Decision Tree Regression

In [36]:
from sklearn.tree import DecisionTreeRegressor

DTR = DecisionTreeRegressor()

In [37]:
DTR.fit(X_train,y_train)

In [38]:
prediction2 = DTR.predict(X_test)

In [39]:
original.shape

(150,)

In [40]:
prediction2.shape

(150,)

In [41]:
mae2 = mean_absolute_error(original,prediction2)

In [42]:
mae2

56497.86290178724

In [43]:
r2_2 = r2_score(original,prediction2)*100

In [44]:
r2_2

61.23850785304803

# Support Vector Regression (SVR)

In [45]:
from sklearn.svm import SVR


sv = SVR()

In [46]:
sv.fit(X_train,y_train)

In [47]:
prediction3 = sv.predict(X_test)

In [48]:
mae3 = mean_absolute_error(original,prediction3)

In [49]:
mae3

94790.3717051911

In [50]:
r2_3 = r2_score(original,prediction3)*100

In [51]:
r2_3

-0.7151165430038908

# Dumping

In [52]:
import joblib

joblib.dump(LinearRegression, 'linear_regression_real_estate.joblib')
print("Linear Regression model successfully saved as 'linear_regression_real_estate.joblib'")

Linear Regression model successfully saved as 'linear_regression_real_estate.joblib'
