In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('house_prices.csv')

INSPECTING THE DATASET

In [3]:
print(df.head())  

   Size  Location  Number of Rooms   Price
0  1660     urban                4  335506
1  4572     urban                2  581608
2  3892     rural                5  608052
3  1266  suburban                1  377438
4  4244     urban                4  531108


In [7]:
print(df.info())  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Size             3000 non-null   int64 
 1   Location         3000 non-null   object
 2   Number of Rooms  3000 non-null   int64 
 3   Price            3000 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 93.9+ KB
None


In [5]:
print(df.describe())

              Size  Number of Rooms         Price
count  3000.000000      3000.000000  3.000000e+03
mean   2943.929000         3.453667  6.085260e+05
std    1200.752576         1.705332  1.802123e+05
min     801.000000         1.000000  1.447770e+05
25%    1890.500000         2.000000  4.789830e+05
50%    2951.000000         3.000000  6.089300e+05
75%    3991.000000         5.000000  7.403642e+05
max    4999.000000         6.000000  1.067406e+06


CHECKING FOR MISSING VALUES

In [6]:
missing_values = df.isnull().sum()
print(missing_values)

Size               0
Location           0
Number of Rooms    0
Price              0
dtype: int64


DATA PREPROCESSING

Normalize Numerical Data (Min-Max Scaling)

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Initializing MinMaxScaler
scaler = MinMaxScaler()

# Normalizing 'Size' and 'Number of Rooms' columns
df[['Size', 'Number of Rooms']] = scaler.fit_transform(df[['Size', 'Number of Rooms']])

Encode Categorical Features (One-Hot Encoding)

In [10]:
# One-Hot Encoding for the 'Location' feature
df = pd.get_dummies(df, columns=['Location'], drop_first=True)  # drop_first to avoid multicollinearity

FEATURE SELECTION

In [11]:
# Calculating correlations
correlation_matrix = df.corr()

# Displaying the correlation matrix
print(correlation_matrix)

# Identifying features with high correlation to the target variable 'Price'
high_correlation = correlation_matrix['Price'].sort_values(ascending=False)
print(high_correlation)

                       Size  Number of Rooms     Price  Location_suburban  \
Size               1.000000        -0.001334  0.640540           0.010536   
Number of Rooms   -0.001334         1.000000  0.136009           0.014975   
Price              0.640540         0.136009  1.000000          -0.012518   
Location_suburban  0.010536         0.014975 -0.012518           1.000000   
Location_urban     0.016728        -0.006251  0.033227          -0.487334   

                   Location_urban  
Size                     0.016728  
Number of Rooms         -0.006251  
Price                    0.033227  
Location_suburban       -0.487334  
Location_urban           1.000000  
Price                1.000000
Size                 0.640540
Number of Rooms      0.136009
Location_urban       0.033227
Location_suburban   -0.012518
Name: Price, dtype: float64


MODEL TRAINING

Train-Test Split

In [12]:
from sklearn.model_selection import train_test_split

# Splitting the data into features (X) and target (y)
X = df.drop('Price', axis=1)
y = df['Price']

# Splitting the ds into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Training a Linear Regression Model

In [14]:
from sklearn.linear_model import LinearRegression

# Initializing the linear regression model
model = LinearRegression()

# Training the model on the training data
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)
print(y_pred)

[697166.99336167 796082.57233095 604359.83671678 470432.78697094
 760893.08448167 654673.58236332 513198.21342002 700328.57647071
 584036.04147337 605192.66008251 778386.96591276 717006.84290238
 373168.02865578 738222.3146577  811698.87677859 486682.20273912
 489888.0384986  524215.85152725 627080.57784991 677439.15126509
 652124.36006761 767612.49180136 804189.89227266 466702.08922808
 393906.27983807 450827.79470725 387361.27524959 428313.49074899
 816392.34219682 456311.84378266 521672.34789027 544290.76411611
 626248.76095512 693991.3848751  746844.81404597 569805.26735879
 693120.02751765 476151.69332336 625245.44634614 398666.3300031
 542366.34642212 544748.46623449 645291.93883285 559066.94495015
 519469.82673977 651326.68193369 453436.67084895 832105.45866417
 506823.49430364 684350.1696823  782551.86363077 446842.49565111
 665868.00549072 682067.2441786  718819.04738111 615754.40324212
 771459.53982905 478089.93062375 612917.76430014 625704.15493547
 721066.827653   686097.80

MODEL EVALUATION

Evaluation Metrics

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Calculating R²
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")

RMSE: 134449.6620758342
R²: 0.44861905515069533
