# importing the libraries

In [3]:
import pandas as pd
import numpy as np

# importing the Dataset

In [4]:
df= pd.read_csv(r"C:\Users\Manisha\DATA SCIENCE\housing_price_dataset.csv")

In [5]:
df.dtypes

SquareFeet        int64
Bedrooms          int64
Bathrooms         int64
Neighborhood     object
YearBuilt         int64
Price           float64
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


# eda

In [7]:
df.isna().sum() #checking null values

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [8]:
df.duplicated().sum() #checking duplicate values

0

In [9]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


# Data Preprocessing

### 
1. onehot - nominal data
2. ordinal - ordinal data

In [10]:
df['Neighborhood']=df['Neighborhood'].map({'Rural':1,'Suburb':2,'Urban':3}) #ordinal encoding (categorical to numerical)

In [11]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,1,1969,215355.283618
1,2459,3,2,1,1980,195014.221626
2,1860,2,1,2,1970,306891.012076
3,2294,2,1,3,1996,206786.787153
4,2130,5,2,2,2001,272436.239065


# Model building

#### training and testing the machine
- splitting the input and output

In [12]:
x = df.drop('Price',axis = 1) #input

In [13]:
x

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt
0,2126,4,1,1,1969
1,2459,3,2,1,1980
2,1860,2,1,2,1970
3,2294,2,1,3,1996
4,2130,5,2,2,2001
...,...,...,...,...,...
49995,1282,5,3,1,1975
49996,2854,2,2,2,1988
49997,2979,5,3,2,1962
49998,2596,5,2,1,1984


In [14]:
y = df['Price'] #output

In [15]:
# !pip install scikit-learn

#### splitting the data as training data and testing data
- x into training and testing data
- y into training and testing data

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
train_test_split(x,y,test_size=0.2) #randomly splits the data and expects train or test data size

[       SquareFeet  Bedrooms  Bathrooms  Neighborhood  YearBuilt
 3454         2445         4          3             2       1992
 401          2135         3          1             2       1955
 9490         2686         5          1             2       1971
 13715        1922         2          3             2       2021
 18487        2237         2          2             1       2019
 ...           ...       ...        ...           ...        ...
 48513        1111         5          2             1       2004
 45225        2182         3          2             1       1963
 10117        2625         4          2             1       1984
 127          1187         5          3             2       1957
 49801        1315         2          2             3       2018
 
 [40000 rows x 5 columns],
        SquareFeet  Bedrooms  Bathrooms  Neighborhood  YearBuilt
 756          2417         4          3             2       2001
 43691        2610         2          3             2       1

In [18]:
# to make it constant data
train_test_split(x,y,test_size=0.2,random_state=23)

[       SquareFeet  Bedrooms  Bathrooms  Neighborhood  YearBuilt
 20198        2169         2          3             1       1988
 34103        2568         3          2             2       1977
 40179        1339         3          3             1       1958
 34586        2716         2          1             1       1978
 30725        2307         3          1             1       1974
 ...           ...       ...        ...           ...        ...
 9704         2788         4          3             2       1986
 11190        2557         5          1             2       2004
 26569        2344         3          2             1       1961
 9256         1105         5          2             2       2008
 41555        2248         2          3             3       2020
 
 [40000 rows x 5 columns],
        SquareFeet  Bedrooms  Bathrooms  Neighborhood  YearBuilt
 49466        1055         3          2             1       1969
 11621        1754         4          3             1       1

In [19]:
# unpacking
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=23)

In [20]:
x_train.shape #80% of the input goes for training the machine

(40000, 5)

In [21]:
x_test.shape #10% of input for testing

(10000, 5)

In [22]:
y_train.shape #80% of the ouput goes fpr training the machine

(40000,)

In [23]:
y_test.shape #10% of output for testing

(10000,)

## ML algorithms
- KNN
- Linear Regression
- SVM - SVR 
- DT

## KNN

In [24]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [25]:
## KNeighborsRegressor is the class in sklearn.neighbors , we should create an object for it.
## If a name consists of a capital letter it is a class.

In [26]:
#creating the object 'knn'
knn = KNeighborsRegressor()

In [27]:
knn.fit(x_train,y_train)

In [28]:
y_pred = knn.predict(x_test)

In [29]:
np.sqrt(mean_squared_error(y_test,y_pred)) # mean squareroot error

54247.93901533543

## Linear Regression

In [30]:
from sklearn.linear_model import LinearRegression

In [31]:
lr = LinearRegression()
lr.fit(x_train,y_train)
y_pred =lr.predict(x_test)

In [32]:
print(np.sqrt(mean_squared_error(y_test,y_pred)))

49403.976927995944


## Support Vector Regression

In [33]:
from sklearn.svm import SVR

In [34]:
# sv = SVR()
# sv.fit(x_train,y_train)
# y_pred =sv.predict(x_test)

In [35]:
# print(np.sqrt(mean_squared_error(y_test,y_pred)))

## Decision Tree

In [36]:
from sklearn.tree import DecisionTreeRegressor

In [37]:
dt = DecisionTreeRegressor()
dt.fit(x_train,y_train)
y_pred =dt.predict(x_test)

In [38]:
print(np.sqrt(mean_squared_error(y_test,y_pred)))

72322.49389528675


In [39]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,1,1969,215355.283618
1,2459,3,2,1,1980,195014.221626
2,1860,2,1,2,1970,306891.012076
3,2294,2,1,3,1996,206786.787153
4,2130,5,2,2,2001,272436.239065


In [40]:
lr.predict([[2622,3,3,3,2001]])[0]



286444.75528787763

In [41]:
lr.coef_

array([  99.31308607, 5168.85659283, 2625.12603247,  752.26187692,
        -13.64933459])

In [42]:
lr.intercept_

27719.42860046134

In [43]:
#lr fromula
#y = m1x1+m2x2+m3x3...+c where m is the coef and x is the imput parameter

In [45]:
#Pattern of House price prediction using Linear Regresssion
sqft = 2622
bed = 3
bath = 3
nb = 3
year = 2001
price = 99.31308607 * sqft +  5168.85659283 *bed + 2625.12603247*bath +752.26187692* nb -13.64933459 * year + 27719.4286004623
print(price)

286444.75526807236


## PICKLING

In [1]:
import pickle

In [46]:
pickle.dump(lr,open("lr.pkl","wb")) #writing in binary format

In [47]:
model = pickle.load(open("lr.pkl","rb"))

In [48]:
model.predict([[2622,3,3,3,2001]])



array([286444.75528788])