In [202]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [204]:
# Load the housing dataset (columns: town, area, price).
# The CSV location defaults to the original Desktop path, but it can be
# overridden with the HOUSING_DATA_CSV environment variable so the notebook
# can run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "HOUSING_DATA_CSV",
    "C:\\Users\\Muzamil Hussain\\Desktop\\Housing_data.csv",
)
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3600,565000
2,monroe township,4000,610000
3,monroe township,5000,680000
4,monroe township,5600,725000
5,west windsor,3000,585000
6,west windsor,2500,615000
7,west windsor,3800,650000
8,west windsor,5200,710000
9,robinesville,2200,575000


In [206]:
# Expand the categorical 'town' column into one 0/1 indicator column per town.
dummies = pd.get_dummies(data=df.loc[:, 'town'], dtype=int)
dummies

Unnamed: 0,monroe township,robinesville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [208]:
# Place the indicator columns side by side with the original columns
# (rows are aligned on the shared index).
merge = pd.concat(objs=[df, dummies], axis='columns')
merge

Unnamed: 0,town,area,price,monroe township,robinesville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3600,565000,1,0,0
2,monroe township,4000,610000,1,0,0
3,monroe township,5000,680000,1,0,0
4,monroe township,5600,725000,1,0,0
5,west windsor,3000,585000,0,0,1
6,west windsor,2500,615000,0,0,1
7,west windsor,3800,650000,0,0,1
8,west windsor,5200,710000,0,0,1
9,robinesville,2200,575000,0,1,0


In [210]:
# Remove the text 'town' column (now represented by the dummies) and one of
# the dummy columns ('west windsor') to avoid the dummy-variable trap;
# 'west windsor' becomes the baseline where both remaining dummies are 0.
final = merge.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinesville
0,2600,550000,1,0
1,3600,565000,1,0
2,4000,610000,1,0
3,5000,680000,1,0
4,5600,725000,1,0
5,3000,585000,0,0
6,2500,615000,0,0
7,3800,650000,0,0
8,5200,710000,0,0
9,2200,575000,0,1


In [212]:
# Features (X): every column except the target 'price'
X = final.drop(columns='price')
X


Unnamed: 0,area,monroe township,robinesville
0,2600,1,0
1,3600,1,0
2,4000,1,0
3,5000,1,0
4,5600,1,0
5,3000,0,0
6,2500,0,0
7,3800,0,0
8,5200,0,0
9,2200,0,1


In [214]:
# Target (Y): the sale price column
Y = final.loc[:, 'price']
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [216]:
# Fit an ordinary least-squares regression of price on area + town dummies.
# fit() returns the estimator itself, so creation and training can be chained.
model = LinearRegression().fit(X, Y)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [218]:
# Single query row in X's column order (area, monroe township, robinesville):
# a 2500 sq ft home with both town dummies at 0, i.e. the dropped baseline
# town 'west windsor'. Reusing X.columns keeps the feature names identical to
# the ones the model was fitted with.
new_data = pd.DataFrame(
    data=[[2500, 0, 0]],
    columns=X.columns,
)
new_data

Unnamed: 0,area,monroe township,robinesville
0,2500,0,0


In [220]:
# Predict the price for the query row; predict() returns a 1-element array.
prediction = model.predict(X=new_data)
print("Prediction:", prediction)

Prediction: [588644.01151207]


In [222]:
# Score the fitted model on the same (X, Y) it was trained on. This is a
# training-set score (R^2 for LinearRegression), not an estimate of
# out-of-sample accuracy.
model.score(X,Y)

0.8443974979869785

In [224]:
# Second approach: show the original DataFrame again before encoding 'town'
# with scikit-learn's LabelEncoder and OneHotEncoder.
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3600,565000
2,monroe township,4000,610000
3,monroe township,5000,680000
4,monroe township,5600,725000
5,west windsor,3000,585000
6,west windsor,2500,615000
7,west windsor,3800,650000
8,west windsor,5200,710000
9,robinesville,2200,575000


In [226]:
# Create a LabelEncoder; it is used in the next cell to convert the town
# names into integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [252]:
# Create a genuinely new DataFrame for the label-encoded data.
# A plain `dfle = df` only binds a second name to the same object, so the
# assignment below would also overwrite the town names in `df`. Copying keeps
# `df` intact and makes this cell safe to re-run.
dfle = df.copy()
# Replace each town name with its integer code
# (monroe township -> 0, robinesville -> 1, west windsor -> 2).
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3600,565000
2,0,4000,610000
3,0,5000,680000
4,0,5600,725000
5,2,3000,585000
6,2,2500,615000
7,2,3800,650000
8,2,5200,710000
9,1,2200,575000


In [250]:
# Feature matrix as a NumPy array: column 0 = town code, column 1 = area
X = dfle.loc[:, ['town', 'area']].values
X

array([[   0, 2600],
       [   0, 3600],
       [   0, 4000],
       [   0, 5000],
       [   0, 5600],
       [   2, 3000],
       [   2, 2500],
       [   2, 3800],
       [   2, 5200],
       [   1, 2200],
       [   1, 3500],
       [   1, 4600],
       [   1, 5500]], dtype=int64)

In [254]:
# Target vector: sale prices as a NumPy array
y = dfle['price'].values
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000], dtype=int64)

In [256]:
# Build a ColumnTransformer that one-hot encodes column 0 (the town code)
# and passes the remaining column (area) through unchanged.

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)



In [258]:
# One-hot encode the town code; the result has one dummy column per town
# followed by the passthrough 'area' column.
# The input is taken from the raw (town, area) columns of `dfle` rather than
# from `X` itself: `X = ct.fit_transform(X)` feeds its own output back into
# the encoder when the cell is run a second time, corrupting the features.
X = ct.fit_transform(dfle[['town', 'area']].values)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 5.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 5.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.5e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 5.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.5e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 5.5e+03]])

In [260]:
# Drop the first dummy column (town code 0, 'monroe township') to avoid the
# dummy-variable trap, leaving [robinesville, west windsor, area].
# The matrix is rebuilt from the already-fitted transformer and the raw
# columns instead of slicing `X` in place, because `X = X[:,1:]` strips one
# more column every time the cell is re-run.
X = ct.transform(dfle[['town', 'area']].values)[:, 1:]

In [262]:
# Final feature matrix; columns are: robinesville dummy, west windsor dummy, area
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 5.0e+03],
       [0.0e+00, 0.0e+00, 5.6e+03],
       [0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 2.5e+03],
       [0.0e+00, 1.0e+00, 3.8e+03],
       [0.0e+00, 1.0e+00, 5.2e+03],
       [1.0e+00, 0.0e+00, 2.2e+03],
       [1.0e+00, 0.0e+00, 3.5e+03],
       [1.0e+00, 0.0e+00, 4.6e+03],
       [1.0e+00, 0.0e+00, 5.5e+03]])

In [264]:
# Refit the LinearRegression instance created earlier, this time on the
# one-hot-encoded array; this replaces the coefficients learned from the
# get_dummies features.
model.fit(X,y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [266]:
# Feature order is [robinesville, west windsor, area].
model.predict([[0,1,3400]]) # 3400 sq ft home in west windsor

array([629728.80230239])

In [268]:
# Feature order is [robinesville, west windsor, area].
model.predict([[1,0,2800]]) # 2800 sq ft home in robinesville

array([570002.76732339])