In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Converting CSV file to Datafile as housing_data
data = pd.read_csv('housing_data.csv')

In [3]:
data

Unnamed: 0,square_footage,bedrooms,bathrooms,location_encoded,age_of_house,price
0,1360,1,3,4,43,259048
1,4272,1,2,4,50,723405
2,3592,3,2,3,23,605058
3,966,5,2,2,44,216077
4,4926,4,1,1,38,783922
...,...,...,...,...,...,...
495,1743,5,3,1,5,318537
496,4209,3,2,3,55,703979
497,1581,2,1,3,2,296795
498,955,1,2,4,62,213180


In [65]:
#Display the first 5 rows of the dataframe
data.head()

Unnamed: 0,square_footage,bedrooms,bathrooms,location_encoded,age_of_house,price
0,1360,1,3,4,43,259048
1,4272,1,2,4,50,723405
2,3592,3,2,3,23,605058
3,966,5,2,2,44,216077
4,4926,4,1,1,38,783922


In [67]:
##Display the last 5 rows of the dataframe
data.tail()

Unnamed: 0,square_footage,bedrooms,bathrooms,location_encoded,age_of_house,price
495,1743,5,3,1,5,318537
496,4209,3,2,3,55,703979
497,1581,2,1,3,2,296795
498,955,1,2,4,62,213180
499,1394,3,3,3,91,249899


In [69]:
#display the summary of the dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   square_footage    500 non-null    int64
 1   bedrooms          500 non-null    int64
 2   bathrooms         500 non-null    int64
 3   location_encoded  500 non-null    int64
 4   age_of_house      500 non-null    int64
 5   price             500 non-null    int64
dtypes: int64(6)
memory usage: 23.6 KB


In [7]:
#discribe the dataframe
data.describe()

Unnamed: 0,square_footage,bedrooms,bathrooms,location_encoded,age_of_house,price
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,2805.66,2.97,1.986,2.518,48.476,476615.732
std,1261.356268,1.452349,0.821678,1.126149,29.02534,191168.316793
min,504.0,1.0,1.0,1.0,1.0,116529.0
25%,1666.75,2.0,1.0,2.0,23.0,304564.0
50%,2930.0,3.0,2.0,2.0,47.0,488636.0
75%,3830.75,4.0,3.0,4.0,73.25,626042.5
max,4999.0,5.0,3.0,4.0,99.0,873581.0


In [8]:
#display the shape of data
data.shape

(500, 6)

In [71]:
##display the size of data
data.size

3000

In [73]:
#display the datatypes of data structure
data.dtypes

square_footage      int64
bedrooms            int64
bathrooms           int64
location_encoded    int64
age_of_house        int64
price               int64
dtype: object

In [11]:
#Display the Number of Unique values
data.nunique()

square_footage      471
bedrooms              5
bathrooms             3
location_encoded      4
age_of_house         98
price               500
dtype: int64

In [12]:
#Display the numm values
data.isnull()

Unnamed: 0,square_footage,bedrooms,bathrooms,location_encoded,age_of_house,price
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
495,False,False,False,False,False,False
496,False,False,False,False,False,False
497,False,False,False,False,False,False
498,False,False,False,False,False,False


In [75]:
#drop the duplicate values
data.drop_duplicates()

Unnamed: 0,square_footage,bedrooms,bathrooms,location_encoded,age_of_house,price
0,1360,1,3,4,43,259048
1,4272,1,2,4,50,723405
2,3592,3,2,3,23,605058
3,966,5,2,2,44,216077
4,4926,4,1,1,38,783922
...,...,...,...,...,...,...
495,1743,5,3,1,5,318537
496,4209,3,2,3,55,703979
497,1581,2,1,3,2,296795
498,955,1,2,4,62,213180


In [14]:
#describe the data in a tableform
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
square_footage,500.0,2805.66,1261.356268,504.0,1666.75,2930.0,3830.75,4999.0
bedrooms,500.0,2.97,1.452349,1.0,2.0,3.0,4.0,5.0
bathrooms,500.0,1.986,0.821678,1.0,1.0,2.0,3.0,3.0
location_encoded,500.0,2.518,1.126149,1.0,2.0,2.0,4.0,4.0
age_of_house,500.0,48.476,29.02534,1.0,23.0,47.0,73.25,99.0
price,500.0,476615.732,191168.316793,116529.0,304564.0,488636.0,626042.5,873581.0


In [15]:
#Extracting Independent and dependent Variable  
x= data.iloc[:, [0,1,2,3]].values  
y= data.iloc[:, 5].values

In [16]:
#display the value of x
x

array([[1360,    1,    3,    4],
       [4272,    1,    2,    4],
       [3592,    3,    2,    3],
       ...,
       [1581,    2,    1,    3],
       [ 955,    1,    2,    4],
       [1394,    3,    3,    3]], dtype=int64)

In [77]:
#display the value of y
y

array([259048, 723405, 605058, 216077, 783922, 565142, 624626, 591134,
       161240, 382567, 291402, 504803, 498121, 342263, 693545, 738684,
       577398, 236020, 253600, 519545, 445799, 533145, 268871, 388387,
       210261, 531461, 571973, 356244, 300606, 389580, 667335, 645520,
       710377, 247311, 584984, 458941, 275160, 460450, 516690, 174281,
       417822, 586661, 471617, 332731, 378871, 519764, 284175, 740702,
       162353, 555272, 423330, 337849, 686881, 565636, 251761, 667501,
       569064, 683674, 360940, 171408, 769274, 411438, 326832, 622834,
       676179, 278530, 293452, 212581, 274504, 256303, 713092, 205216,
       251831, 410898, 708378, 172238, 493265, 412006, 553011, 345509,
       166589, 856870, 619929, 427979, 618238, 185130, 585244, 514782,
       707288, 190924, 562275, 521429, 322189, 533004, 744169, 244110,
       210264, 873581, 251227, 808178, 128891, 297753, 274998, 579571,
       208933, 721653, 395971, 420605, 619253, 326656, 422468, 611683,
      

In [18]:
# Splitting the dataset into training and test set.  
from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, random_state=42)

In [19]:
x_train.shape

(400, 4)

In [20]:
x_test.shape

(100, 4)

In [21]:
 y_train.shape

(400,)

In [22]:
y_test.shape

(100,)

In [23]:
# Scaling the features using StandardScaler
scaler = StandardScaler()

# Fitting and transforming the training data, and transforming the test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [24]:
x_train_scaled

array([[-0.85618246,  0.04705297, -1.20956777, -0.48591521],
       [-1.36515799, -1.34710905, -1.20956777,  0.40567233],
       [ 0.17669227,  0.74413398,  0.01221786, -0.48591521],
       ...,
       [ 0.63932589, -0.65002804,  1.23400348,  0.40567233],
       [-0.76035682,  1.44121498, -1.20956777, -1.37750276],
       [-1.15465577,  0.04705297, -1.20956777,  0.40567233]])

In [25]:
x_test_scaled

array([[ 0.11464124,  0.04705297,  1.23400348, -1.37750276],
       [-0.21289394, -1.34710905,  1.23400348, -0.48591521],
       [ 0.59219853,  1.44121498, -1.20956777, -0.48591521],
       [-0.47130899,  0.74413398, -1.20956777, -0.48591521],
       [-1.39029259,  1.44121498,  0.01221786, -1.37750276],
       [ 1.28497078,  1.44121498,  0.01221786,  1.29725988],
       [-1.12952117, -1.34710905,  1.23400348, -0.48591521],
       [ 0.76107159,  1.44121498, -1.20956777, -0.48591521],
       [-1.14287393, -0.65002804, -1.20956777,  1.29725988],
       [ 1.02341391, -1.34710905, -1.20956777,  1.29725988],
       [-0.50901088, -1.34710905,  1.23400348,  1.29725988],
       [-1.03448099, -0.65002804, -1.20956777,  0.40567233],
       [-1.34866342,  0.04705297,  1.23400348, -1.37750276],
       [ 0.601624  ,  1.44121498,  0.01221786,  0.40567233],
       [ 0.37933994,  1.44121498,  0.01221786,  1.29725988],
       [ 1.24098524,  0.74413398,  0.01221786, -1.37750276],
       [-0.85618246,  1.

In [26]:
# Initializing the LinearRegression model
lr = LinearRegression()

In [27]:
lr.fit(x_train,y_train)

In [28]:
#Making predictions on the test set
y_pred=lr.predict(x_test)

In [29]:
y_pred

array([475540.2481529 , 412154.37265121, 602014.21376057, 389499.81999884,
       207742.73988614, 774830.83534041, 237848.04799211, 634127.20416478,
       280145.92250464, 681811.14056657, 394750.33895313, 281305.12067734,
       197277.45223181, 625432.38547851, 602615.58912623, 697830.70361918,
       311482.36519179, 642989.70630405, 585884.55379633, 578614.94050322,
       226872.58342318, 662727.4727695 , 352749.67456104, 569881.43249697,
       502838.86698359, 246511.08726394, 291013.69917163, 717821.01983679,
       316647.08022888, 333222.78300361, 390591.53496128, 292086.51567349,
       814850.15585642, 514879.66163438, 603832.3750795 , 603728.61155896,
       286852.84851287, 738408.20928502, 371974.27734855, 628722.86802701,
       314570.39934869, 564343.02233749, 642157.08865652, 585434.94894724,
       255771.00100558, 403682.37096957, 271526.07344863, 467226.03573376,
       538685.92603945, 270351.53885395, 255496.56726869, 349641.85078394,
       760061.88308321, 3

In [79]:
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Outputting the results
print(f"\nMean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")


Mean Squared Error: 375829565.36
R-squared: 0.99
