In [2]:
# This notebook builds a model that predicts house prices in different towns. It uses two concepts:
# dummy variables and one-hot encoding.

# This notebook is meant to be run in Jupyter Notebook, although Google Colab works too.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [3]:
# Load the raw data. The path is relative, so "homeprices2.csv" must sit in the
# notebook's working directory. Later cells read the columns `town`, `area`
# and `price` from this frame.
df = pd.read_csv("homeprices2.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [7]:
# pd.get_dummies one-hot encodes the town column: it returns one 0/1 indicator
# column per distinct town name. The table shown under this cell is that
# indicator matrix.
dummies = pd.get_dummies(df["town"], dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [9]:
# pd.concat takes the list of frames to combine and the axis to combine along.
# Concatenating along the column axis places the dummy columns to the right of
# the original columns, with rows matched on the index.
merged = pd.concat(objs=[df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [16]:
# Two columns are removed before modelling:
#   * 'town'         - the raw text label; the dummy columns already carry this
#                      information in numeric form, which linear regression needs.
#   * 'west windsor' - one dummy column has to go to avoid the dummy variable trap
#                      (the dummies are redundant when all of them are kept).
#                      West Windsor therefore becomes the baseline town, encoded
#                      as 0 in both remaining dummy columns.
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [27]:
# Unfitted linear regression estimator; it is trained further down with model.fit.
# LinearRegression is imported in the notebook's import cell at the top, so that
# all dependencies are visible in one place.
model = LinearRegression()

In [28]:
# Feature matrix: every column of `final` except 'price', which is the target
# the model is asked to predict.
X = final.drop(columns=['price'])
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [29]:
# Target vector: the sale price of each house.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [30]:
# Calling fit trains the model on the feature matrix X and the target prices y.

model.fit(X,y)

In [35]:
# The model was fitted on the columns area, 'monroe township', 'robinsville'
# (in that order). The row is built with named columns so the town encoding is
# explicit and the input carries the same feature names the model was fitted with.
# This row sets the 'robinsville' dummy to 1, so the prediction is for a
# 2800 sq ft house in Robinsville (not Monroe township, as the message used to say).
predicted_price = model.predict(
    pd.DataFrame({"area": [2800], "monroe township": [0], "robinsville": [1]})
)
print("The predicted price of house for robinsville is = ", predicted_price)



The predicted price of house for Monroe township is =  [590775.63964739]


In [36]:
# 'west windsor' is the baseline town: its dummy column was dropped before
# fitting, so it is encoded by setting both remaining town dummies to 0.
# The row is built with named columns (same names and order as the training
# features) instead of a bare positional list.
predicted_price = model.predict(
    pd.DataFrame({"area": [2800], "monroe township": [0], "robinsville": [0]})
)
print("The predicted price of house for West windsor is = ", predicted_price)



The predicted price of house for West windsor is =  [605103.20361213]


In [37]:
# For a regressor, score returns R^2 (the coefficient of determination), not a
# classification accuracy. It is evaluated here on the same X and y that were
# passed to fit, so it describes how well the model fits the training data,
# not how well it predicts unseen houses.
model.score(X,y)

0.9573929037221874