# House Price Prediction with Linear Regression
Predicting house prices using linear regression involves creating a model that estimates the relationship between house prices and various features

## Import Necessary Libraries

In [434]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

## Data Analysis

In [435]:
dataframe = pd.read_csv('../data/data.csv')
dataframe.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [436]:
dataframe.shape

(4600, 18)

In [437]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [438]:
dataframe.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,0.240652,3.451739,1827.265435,312.081522,1970.786304,808.608261
std,563834.7,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.778405,0.67723,862.168977,464.137228,29.731848,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,322875.0,3.0,1.75,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,460943.5,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [439]:
dataframe['view'].value_counts()

view
0    4140
2     205
3     116
4      70
1      69
Name: count, dtype: int64

In [440]:
dataframe['waterfront'].value_counts()

waterfront
0    4567
1      33
Name: count, dtype: int64

In [441]:
dataframe['condition'].value_counts()

condition
3    2875
4    1252
5     435
2      32
1       6
Name: count, dtype: int64

In [442]:
dataframe['street'].value_counts()

street
2520 Mulberry Walk NE     4
2500 Mulberry Walk NE     3
9413 34th Ave SW          2
6008 8th Ave NE           2
11034 NE 26th Pl          2
                         ..
1404 Broadmoor Dr E       1
3249 E Ames Lake Dr NE    1
6032 35th Ave NE          1
1006 NE Ravenna Blvd      1
18717 SE 258th St         1
Name: count, Length: 4525, dtype: int64

In [443]:
dataframe['city'].value_counts().shape

(44,)

In [444]:
dataframe['statezip'].value_counts()

statezip
WA 98103    148
WA 98052    135
WA 98117    132
WA 98115    130
WA 98006    110
           ... 
WA 98047      6
WA 98288      3
WA 98050      2
WA 98354      2
WA 98068      1
Name: count, Length: 77, dtype: int64

In [445]:
dataframe['country'].value_counts()

country
USA    4600
Name: count, dtype: int64

In [446]:
dataframe.isna().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [447]:
dataframe.isnull().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

## Dropping Some Columns
* Drop Date
* Drop View
* Drop Street
* Drop City
* Drop Statezip
* Drop Country
* Drop Sqft Basement
* Drop Year Renovated

In [448]:
new_dataframe = dataframe.drop(columns=['date','view','street','city','statezip','country','sqft_basement','yr_renovated'])
new_dataframe

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,yr_built
0,3.130000e+05,3.0,1.50,1340,7912,1.5,0,3,1340,1955
1,2.384000e+06,5.0,2.50,3650,9050,2.0,0,5,3370,1921
2,3.420000e+05,3.0,2.00,1930,11947,1.0,0,4,1930,1966
3,4.200000e+05,3.0,2.25,2000,8030,1.0,0,4,1000,1963
4,5.500000e+05,4.0,2.50,1940,10500,1.0,0,4,1140,1976
...,...,...,...,...,...,...,...,...,...,...
4595,3.081667e+05,3.0,1.75,1510,6360,1.0,0,4,1510,1954
4596,5.343333e+05,3.0,2.50,1460,7573,2.0,0,3,1460,1983
4597,4.169042e+05,3.0,2.50,3010,7014,2.0,0,3,3010,2009
4598,2.034000e+05,4.0,2.00,2090,6630,1.0,0,3,1070,1974


In [449]:
new_dataframe.shape

(4600, 10)

In [450]:
new_dataframe.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,yr_built
0,313000.0,3.0,1.5,1340,7912,1.5,0,3,1340,1955
1,2384000.0,5.0,2.5,3650,9050,2.0,0,5,3370,1921
2,342000.0,3.0,2.0,1930,11947,1.0,0,4,1930,1966
3,420000.0,3.0,2.25,2000,8030,1.0,0,4,1000,1963
4,550000.0,4.0,2.5,1940,10500,1.0,0,4,1140,1976


## Checking if there's a zero values in selected columns

In [451]:
zero_check = new_dataframe.iloc[:,-2:]
zero_check

Unnamed: 0,sqft_above,yr_built
0,1340,1955
1,3370,1921
2,1930,1966
3,1000,1963
4,1140,1976
...,...,...
4595,1510,1954
4596,1460,1983
4597,3010,2009
4598,1070,1974


In [452]:
(zero_check == 0).sum()

sqft_above    0
yr_built      0
dtype: int64

In [453]:
(new_dataframe['price'] == 0).sum()

49

## Re-assigning Zero Values in Price

In [454]:
new_dataframe.loc[new_dataframe['price'] == 0, 'price'] = new_dataframe['price'].median()

In [455]:
(new_dataframe['price'] == 0).sum()

0

In [456]:
new_dataframe.shape

(4600, 10)

In [457]:
new_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        4600 non-null   float64
 1   bedrooms     4600 non-null   float64
 2   bathrooms    4600 non-null   float64
 3   sqft_living  4600 non-null   int64  
 4   sqft_lot     4600 non-null   int64  
 5   floors       4600 non-null   float64
 6   waterfront   4600 non-null   int64  
 7   condition    4600 non-null   int64  
 8   sqft_above   4600 non-null   int64  
 9   yr_built     4600 non-null   int64  
dtypes: float64(4), int64(6)
memory usage: 359.5 KB


In [458]:
new_dataframe.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,yr_built
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,556873.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,3.451739,1827.265435,1970.786304
std,561006.0,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.67723,862.168977,29.731848
min,7800.0,0.0,0.0,370.0,638.0,1.0,0.0,1.0,370.0,1900.0
25%,328158.9,3.0,1.75,1460.0,5000.75,1.0,0.0,3.0,1190.0,1951.0
50%,460971.7,3.0,2.25,1980.0,7683.0,1.5,0.0,3.0,1590.0,1976.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,4.0,2300.0,1997.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,5.0,9410.0,2014.0


## Convert into integer all the values in floor to make it appropriate

In [459]:
new_dataframe['floors'] = new_dataframe['floors'].astype(int)

## Converting Price Notation into Thousands

In [460]:
new_dataframe['price'] = (new_dataframe['price'] / 1_000)
new_dataframe

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,yr_built
0,313.000000,3.0,1.50,1340,7912,1,0,3,1340,1955
1,2384.000000,5.0,2.50,3650,9050,2,0,5,3370,1921
2,342.000000,3.0,2.00,1930,11947,1,0,4,1930,1966
3,420.000000,3.0,2.25,2000,8030,1,0,4,1000,1963
4,550.000000,4.0,2.50,1940,10500,1,0,4,1140,1976
...,...,...,...,...,...,...,...,...,...,...
4595,308.166667,3.0,1.75,1510,6360,1,0,4,1510,1954
4596,534.333333,3.0,2.50,1460,7573,2,0,3,1460,1983
4597,416.904167,3.0,2.50,3010,7014,2,0,3,3010,2009
4598,203.400000,4.0,2.00,2090,6630,1,0,3,1070,1974


In [461]:
new_dataframe.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,yr_built
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,556.873038,3.40087,2.160815,2139.346957,14852.52,1.45913,0.007174,3.451739,1827.265435,1970.786304
std,561.005967,0.908848,0.783781,963.206916,35884.44,0.552194,0.084404,0.67723,862.168977,29.731848
min,7.8,0.0,0.0,370.0,638.0,1.0,0.0,1.0,370.0,1900.0
25%,328.158929,3.0,1.75,1460.0,5000.75,1.0,0.0,3.0,1190.0,1951.0
50%,460.971731,3.0,2.25,1980.0,7683.0,1.0,0.0,3.0,1590.0,1976.0
75%,654.9625,4.0,2.5,2620.0,11001.25,2.0,0.0,4.0,2300.0,1997.0
max,26590.0,9.0,8.0,13540.0,1074218.0,3.0,1.0,5.0,9410.0,2014.0


In [462]:
new_dataframe.shape

(4600, 10)

## Assigning the Features and Target (X and y)

In [463]:
# FEATURES
X = new_dataframe.drop(columns=['price'])
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,yr_built
0,3.0,1.50,1340,7912,1,0,3,1340,1955
1,5.0,2.50,3650,9050,2,0,5,3370,1921
2,3.0,2.00,1930,11947,1,0,4,1930,1966
3,3.0,2.25,2000,8030,1,0,4,1000,1963
4,4.0,2.50,1940,10500,1,0,4,1140,1976
...,...,...,...,...,...,...,...,...,...
4595,3.0,1.75,1510,6360,1,0,4,1510,1954
4596,3.0,2.50,1460,7573,2,0,3,1460,1983
4597,3.0,2.50,3010,7014,2,0,3,3010,2009
4598,4.0,2.00,2090,6630,1,0,3,1070,1974


In [464]:
X.shape

(4600, 9)

In [465]:
# TARGET
y = new_dataframe['price']
y

0        313.000000
1       2384.000000
2        342.000000
3        420.000000
4        550.000000
           ...     
4595     308.166667
4596     534.333333
4597     416.904167
4598     203.400000
4599     220.600000
Name: price, Length: 4600, dtype: float64

## Splitting Dataset (Training Data and Testing Data)

In [466]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Object Creation

In [467]:
linear_model = LinearRegression()

## Model Training

In [468]:
linear_model.fit(X_train,y_train)

## Prediction

In [469]:
linear_model_prediction = linear_model.predict(X_test)
linear_model_prediction

array([ 505.78573057,  638.97419777,  218.66216671, ..., 1035.93340343,
        318.52383988,  351.0045642 ])

In [473]:
r2 = r2_score(y_test, linear_model_prediction)
r2

0.09661866014406917