In [20]:
import pandas as pd
import numpy as np

In [21]:
df = pd.read_csv("housing_price_dataset.csv")
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [23]:
df.describe()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2006.37468,3.4987,1.99542,1985.40442,224827.325151
std,575.513241,1.116326,0.815851,20.719377,76141.842966
min,1000.0,2.0,1.0,1950.0,-36588.165397
25%,1513.0,3.0,1.0,1967.0,169955.860225
50%,2007.0,3.0,2.0,1985.0,225052.141166
75%,2506.0,4.0,3.0,2003.0,279373.630052
max,2999.0,5.0,3.0,2021.0,492195.259972


In [24]:
df['Neighborhood'].value_counts()


Suburb    16721
Rural     16676
Urban     16603
Name: Neighborhood, dtype: int64

In [25]:
df['Bedrooms'].value_counts()


3    12661
5    12468
2    12436
4    12435
Name: Bedrooms, dtype: int64

In [26]:
df['Bathrooms'].value_counts()

1    16755
2    16719
3    16526
Name: Bathrooms, dtype: int64

In [27]:
df.isna().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [28]:
df['price_per_sqft']= df['Price']*100000/ df['SquareFeet']

In [29]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price,price_per_sqft
0,2126,4,1,Rural,1969,215355.283618,10129600.0
1,2459,3,2,Rural,1980,195014.221626,7930631.0
2,1860,2,1,Suburb,1970,306891.012076,16499520.0
3,2294,2,1,Urban,1996,206786.787153,9014245.0
4,2130,5,2,Suburb,2001,272436.239065,12790430.0


In [30]:
df = df.drop('YearBuilt',axis=1)

In [31]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Neighborhood'] = le.fit_transform(df['Neighborhood'])
df.head()


Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,Price,price_per_sqft
0,2126,4,1,0,215355.283618,10129600.0
1,2459,3,2,0,195014.221626,7930631.0
2,1860,2,1,1,306891.012076,16499520.0
3,2294,2,1,2,206786.787153,9014245.0
4,2130,5,2,1,272436.239065,12790430.0


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [33]:
X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sc = StandardScaler()
X_scaled = sc.fit_transform(X)

model = LinearRegression()
model.fit(X_scaled,y)

pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(r2_score(y_test, y_pred))

0.959005849000046


In [34]:
from sklearn.linear_model import Lasso
lasso = Lasso()
pipe = make_pipeline(StandardScaler(), lasso)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(r2_score(y_test, y_pred))

0.9590058436683003


In [35]:
from sklearn.linear_model import Ridge
ridge = Ridge()
pipe = make_pipeline(StandardScaler(), ridge)
pipe.fit(X_train, y_train)
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)
print("Ridge: ", r2_score(y_test,y_pred_ridge))

Ridge:  0.959005785803435


In [36]:
import pickle   
pickle.dump(pipe, open('RidgeModel.pkl','wb'))

In [38]:
df.columns

Index(['SquareFeet', 'Bedrooms', 'Bathrooms', 'Neighborhood', 'Price',
       'price_per_sqft'],
      dtype='object')